diff --git a/.bazelrc b/.bazelrc
index 316949455a0114..a93862aa78a302 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -299,9 +299,11 @@ common:cuda --@local_config_cuda//:enable_cuda
 common:cuda --config=cuda_version
 # This flag is needed to include CUDA libraries.
 common:cuda --@local_config_cuda//cuda:include_cuda_libs=true
+common:cuda --@cuda_driver//:include_cuda_umd_libs=true
 
 # This configuration is used for building the wheels.
 common:cuda_wheel --@local_config_cuda//cuda:include_cuda_libs=false
+common:cuda_wheel --@cuda_driver//:include_cuda_umd_libs=false
 
 # CUDA: This config refers to building CUDA op kernels with clang.
 common:cuda_clang --config=cuda
@@ -612,7 +614,6 @@ common:use_tar_archive_files --repo_env=USE_LLVM_TAR_ARCHIVE_FILES=1
 common:use_tar_archive_files --repo_env=USE_MIRRORED_TAR_ARCHIVE_FILES=1
 
 # Make Bazel not try to probe the host system for a C++ toolchain.
-common:rbe_base --config=use_tar_archive_files
 common:rbe_base --config=resultstore
 common:rbe_base --repo_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1
 common:rbe_base --define=EXECUTOR=remote
@@ -655,8 +656,8 @@ common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instance
 # Download CUDA/CUDNN redistributions to preserve the repositories cache between
 # CPU and GPU builds.
 # TODO(ybaturina): Uncomment when RBE is ready to support this.
-commonld:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1
-commonld:rbe_linux_cpu --config=cuda_version
+common:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1
+common:rbe_linux_cpu --config=cuda_version
 
 # Deprecated RBE config with non-hermetic toolchains.
 common:rbe_linux_cpu_clang_local --config=rbe_linux_cpu
@@ -682,9 +683,6 @@ common:rbe_linux_cuda --config=cuda_clang_official
 common:rbe_linux_cuda --config=rbe_linux_cpu
 # For Remote build execution -- GPU configuration
 common:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
-# Enable forward compatibility for CUDA builds because RBE docker image doesn't
-# have latest CUDA drivers installed.
-common:rbe_linux_cuda --@cuda_driver//:enable_forward_compatibility=true
 
 common:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
 common:rbe_linux_cuda_nvcc --config=cuda_nvcc
@@ -877,7 +875,7 @@ test:linux_cpu_wheel_test --@local_xla//third_party/py:wheel_dependency=true --c
 test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313
 test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313
 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium
-test:linux_cuda_wheel_test --@local_xla//third_party/py:wheel_dependency=true --config=linux_cuda_wheel_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:prebuilt_wheel_import_api_packages_test_gpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
+test:linux_cuda_wheel_test --repo_env=HERMETIC_CUDA_UMD_VERSION=12.8.1 --@local_xla//third_party/py:wheel_dependency=true --config=linux_cuda_wheel_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:prebuilt_wheel_import_api_packages_test_gpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
 
 # ARM64 WHEEL
 test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-tf_tosa,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313
 test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-tf_tosa,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310,-no_oss_py313
diff --git a/.bazelversion b/.bazelversion
index 5c733d6c13a497..26c75fe8ad4fc9 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1,2 +1,2 @@
-7.4.1
+7.7.0
 # NOTE: Update Bazel version in tensorflow/tools/ci_build/release/common.sh.oss
\ No newline at end of file
diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml
index c0682a4cac7035..07896a48470753 100644
--- a/.github/workflows/osv-scanner-scheduled.yml
+++ b/.github/workflows/osv-scanner-scheduled.yml
@@ -28,7 +28,7 @@ permissions:
 jobs:
   scan-scheduled:
     if: github.repository == 'tensorflow/tensorflow'
-    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.2.3"
+    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.2.4"
     with:
       scan-args: |-
        --lockfile=requirements.txt:./requirements_lock_3_9.txt
diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml
index 75339c6b4f6bd7..e635c4cd8ccc88 100644
--- a/.github/workflows/scorecards-analysis.yml
+++ b/.github/workflows/scorecards-analysis.yml
@@ -55,7 +55,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: SARIF file
          path: results.sarif
@@ -64,6 +64,6 @@ jobs:
       # Upload the results to GitHub's code scanning dashboard (optional).
       # Commenting out will disable upload of results to your repo's Code Scanning dashboard
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@3599b3baa15b485a2e49ef411a7a4bb2452e7f93 # v3.29.5
+        uses: github/codeql-action/upload-sarif@0499de31b99561a6d14a36a5f662c2a54f91beee # v3.29.5
        with:
          sarif_file: results.sarif
diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml
index d9408810eb32ac..53f272bd5b9d8a 100644
--- a/.github/workflows/stale-issues.yml
+++ b/.github/workflows/stale-issues.yml
@@ -31,7 +31,7 @@ jobs:
       pull-requests: write
     steps:
       - name: Awaiting response issues
-        uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+        uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
        with:
          #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale
          exempt-issue-labels: 'override-stale'
@@ -59,7 +59,7 @@ jobs:
       close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale. Please reopen if you'd like to work on this further."
       repo-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Contribution issues
-       uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+       uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
       with:
         #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale
         exempt-issue-labels: 'override-stale'
diff --git a/RELEASE.md b/RELEASE.md
index 4ce8bbb371728e..1c34667494f477 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -23,7 +23,9 @@
 * Adds int8 and int16x8 support for SQRT operator.
 * Adds int16x8 support for EQUAL and NOT_EQUAL operators.
 * Adds support for int2 type.
-    * Adds support for int2/int4 in tfl.cast.
+    * Adds support for int2/int4 in tfl.cast .
+    * Adds support for SRQ int2 in tfl.fully_connected.
+    * Adds support for int4 in tfl.slice.
 
 ### Bug Fixes and Other Changes
 
diff --git a/ci/official/containers/ml_build/Dockerfile b/ci/official/containers/ml_build/Dockerfile
index d12c886cc6d57a..a4fb0cd9b1640a 100644
--- a/ci/official/containers/ml_build/Dockerfile
+++ b/ci/official/containers/ml_build/Dockerfile
@@ -12,14 +12,6 @@ COPY builder.packages.txt /builder.packages.txt
 RUN /setup.sources.sh && /setup.packages.sh /builder.packages.txt
 
-# Install devtoolset-9 in /dt9 with glibc 2.17 and libstdc++ 4.8, for building
-# manylinux2014-compatible packages.
-COPY builder.devtoolset/fixlinks.sh /fixlinks.sh
-COPY builder.devtoolset/rpm-patch.sh /rpm-patch.sh
-COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh
-COPY builder.devtoolset/glibc2.17-inline.patch /glibc2.17-inline.patch
-RUN /build_devtoolset.sh devtoolset-9 /dt9
-
 # Setup Python
 COPY setup.python.sh /setup.python.sh
 COPY builder.requirements.txt /builder.requirements.txt
 
@@ -56,9 +48,6 @@ RUN ln -sf /usr/bin/python3.12 /usr/bin/python3
 RUN ln -sf /usr/bin/python3.12 /usr/bin/python
 RUN ln -sf /usr/lib/python3.12 /usr/lib/tf_python
 
-# Make sure clang is on the path
-RUN ln -s /usr/lib/llvm-18/bin/clang /usr/bin/clang
-
 # Link the compat driver to the location if available.
 RUN if [ -e "/usr/local/cuda/compat/libcuda.so.1" ]; then ln -s /usr/local/cuda/compat/libcuda.so.1 /usr/lib/x86_64-linux-gnu/libcuda.so.1; fi
 
diff --git a/ci/official/containers/ml_build/builder.packages.txt b/ci/official/containers/ml_build/builder.packages.txt
index 8dbbf4196440da..cf914a0425ef11 100644
--- a/ci/official/containers/ml_build/builder.packages.txt
+++ b/ci/official/containers/ml_build/builder.packages.txt
@@ -1,28 +1,9 @@
-# Packages to be installed for the new Docker image.
-
-# Packages needed to build devtoolset
-file
-flex
-g++
-make
-patch
-rpm2cpio
-unar
-wget
-xz-utils
-cpio
-
 # Other build-related tools
 apt-transport-https
 autoconf
 automake
 build-essential
 ca-certificates
-llvm-18
-clang-18
-clang-tidy-18
-lld-18
-clang-format-12
 curl
 git
 parallel
@@ -32,4 +13,6 @@ unzip
 zip
 openjdk-21-jdk
 vim
+wget
 jq
+file
diff --git a/ci/official/containers/ml_build/builder.requirements.txt b/ci/official/containers/ml_build/builder.requirements.txt
index 114efaf9dc9757..ae113c68c2f03c 100644
--- a/ci/official/containers/ml_build/builder.requirements.txt
+++ b/ci/official/containers/ml_build/builder.requirements.txt
@@ -5,6 +5,9 @@ id
 urllib3
 requests
 
+# For XLA
+pyyaml
+
 # For JAX
 build ~= 1.2.2
 # uv is faster than pip for installing Python packages.
diff --git a/ci/official/containers/ml_build/setup.python.sh b/ci/official/containers/ml_build/setup.python.sh
index cd56f3ca552d0f..b849457420f522 100755
--- a/ci/official/containers/ml_build/setup.python.sh
+++ b/ci/official/containers/ml_build/setup.python.sh
@@ -45,16 +45,6 @@ fi
 
 /setup.packages.sh pythons.txt
 
-# Re-link pyconfig.h from x86_64-linux-gnu into the devtoolset directory
-# for any Python version present
-pushd /usr/include/x86_64-linux-gnu
-for f in $(ls | grep python); do
-  # set up symlink for devtoolset-9
-  rm -f /dt9/usr/include/x86_64-linux-gnu/$f
-  ln -s /usr/include/x86_64-linux-gnu/$f /dt9/usr/include/x86_64-linux-gnu/$f
-done
-popd
-
 # Python 3.10 include headers fix:
 # sysconfig.get_path('include') incorrectly points to /usr/local/include/python
 # map /usr/include/python3.10 to /usr/local/include/python3.10
diff --git a/ci/official/envs/windows_x86_2022 b/ci/official/envs/windows_x86_2022
index 56187ad78eca17..3c57bcfb8114ee 100644
--- a/ci/official/envs/windows_x86_2022
+++ b/ci/official/envs/windows_x86_2022
@@ -15,7 +15,7 @@ TFCI_DOCKER_ENABLE=1
 TFCI_DOCKER_PULL_ENABLE=1
 TFCI_DOCKER_IMAGE="gcr.io/tensorflow-testing/tf-win2022@sha256:915cb093630432c38b028f56bd31116a5559ebbc688d427b6092d86828ae03bc"
-TFCI_BAZEL_BAZELRC_ARGS="--output_user_root=C:/t"
+TFCI_BAZEL_BAZELRC_ARGS="--output_user_root=C:/x"
 TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --repo_env=USE_PYWRAP_RULES=True --config=windows_x86_cpu_2022"
 TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=windows_x86_cpu_2022
 TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=tensorflow"
diff --git a/ci/official/utilities/cleanup_docker.sh b/ci/official/utilities/cleanup_docker.sh
index 178da9310969ca..3be4a5f418172e 100755
--- a/ci/official/utilities/cleanup_docker.sh
+++ b/ci/official/utilities/cleanup_docker.sh
@@ -26,4 +26,5 @@ $ docker exec -it tf bash
 EOF
 
 docker ps
-docker rm -f tf-${TFCI_PYTHON_VERSION}
+echo "Removing container tf-$TFCI_PYTHON_VERSION-$TFCI_DOCKER_CONTAINER_POSTFIX"
+docker rm -f tf-$TFCI_PYTHON_VERSION-$TFCI_DOCKER_CONTAINER_POSTFIX
diff --git a/ci/official/utilities/setup_docker.sh b/ci/official/utilities/setup_docker.sh
index 89318aa4ec78dc..01e549d02dfffc 100755
--- a/ci/official/utilities/setup_docker.sh
+++ b/ci/official/utilities/setup_docker.sh
@@ -51,7 +51,7 @@ if ! docker container inspect tf >/dev/null 2>&1 ; then
     echo "GCE_METADATA_HOST=$IP_ADDR" >> $env_file
   fi
 
-  docker run $TFCI_DOCKER_ARGS --name tf-$TFCI_PYTHON_VERSION -w "$WORKING_DIR" -itd --rm \
+  docker run $TFCI_DOCKER_ARGS --name tf-$TFCI_PYTHON_VERSION-$TFCI_DOCKER_CONTAINER_POSTFIX -w "$WORKING_DIR" -itd --rm \
     -v "$TFCI_GIT_DIR:$WORKING_DIR" \
     --env-file "$env_file" \
     "$TFCI_DOCKER_IMAGE" \
@@ -65,4 +65,4 @@ if ! docker container inspect tf >/dev/null 2>&1 ; then
   fi
 fi
 
-tfrun() { docker exec tf-$TFCI_PYTHON_VERSION "$@"; }
+tfrun() { docker exec tf-$TFCI_PYTHON_VERSION-$TFCI_DOCKER_CONTAINER_POSTFIX "$@"; }
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f000821983b779..558b59368e615b 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -1033,6 +1033,7 @@ package_group(
         "//tensorflow_models/google/recml/...",
         "//third_party/cloud_tpu/convergence_tools/sdc_monitoring/...",
         "//third_party/cloud_tpu/inference_converter/...",
+        "//third_party/pathways/...",
        "//third_party/py/cloud_ml_autoflow/...",
        "//third_party/py/envlogger/...",
        "//third_party/py/gldm/...",
@@ -1180,38 +1181,31 @@ tf_cc_shared_library(
     linkstatic = 1,
     per_os_targets = True,
     roots = [
-        "//tensorflow/c/experimental/filesystem:filesystem_interface",
-        "//tensorflow/c/experimental/stream_executor:stream_executor",
-        "//tensorflow/c:env",
-        "//tensorflow/c:kernels",
-        "//tensorflow/c:kernels_experimental",
-        "//tensorflow/c:logging",
-        "//tensorflow/c:ops",
-        "//tensorflow/cc/saved_model:fingerprinting_impl",
-        "//tensorflow/cc/saved_model:loader_lite_impl",
-        "//tensorflow/cc/saved_model:metrics_impl",
-        "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl",
-        "//tensorflow/core/common_runtime:core_cpu_impl",
-        "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
-        "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl",
-        "//tensorflow/core:framework_internal_impl",
-        "//tensorflow/core/framework:tensor",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
-        "//tensorflow/core:lib_internal_impl",
-        "//tensorflow/core/profiler:profiler_impl",
-        "//tensorflow/core/util:determinism",  # Must be linked and exported to libtensorflow_framework.so.
-        "//tensorflow/lite/kernels/shim:tf_kernel_shim",
-        "@local_xla//xla/stream_executor:stream_executor_impl",
-        "@local_xla//xla/tsl/framework:bfc_allocator",
-        "@local_xla//xla/tsl/framework:metrics",
-    ] + tf_additional_binary_deps() +
-    # TODO(b/259305727): Remove this select and include captured_function in macos builds.
-    select({
-        "//tensorflow:macos": [],
-        "//conditions:default": [
-            "//tensorflow/core/data:captured_function",
-        ],
-    }),
+        "//tensorflow/c/experimental/filesystem:filesystem_interface",
+        "//tensorflow/c/experimental/stream_executor:stream_executor",
+        "//tensorflow/c:env",
+        "//tensorflow/c:kernels",
+        "//tensorflow/c:kernels_experimental",
+        "//tensorflow/c:ops",
+        "//tensorflow/cc/saved_model:fingerprinting_impl",
+        "//tensorflow/cc/saved_model:loader_lite_impl",
+        "//tensorflow/cc/saved_model:metrics_impl",
+        "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl",
+        "//tensorflow/core/common_runtime:core_cpu_impl",
+        "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
+        "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl",
+        "//tensorflow/core:framework_internal_impl",
+        "//tensorflow/core/framework:tensor",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
+        "//tensorflow/core:lib_internal_impl",
+        "//tensorflow/core/profiler:profiler_impl",
+        "//tensorflow/core/util:determinism",  # Must be linked and exported to libtensorflow_framework.so.
+ "//tensorflow/lite/kernels/shim:tf_kernel_shim", + "@local_xla//xla/stream_executor:stream_executor_impl", + "@local_xla//xla/tsl/framework:bfc_allocator", + "@local_xla//xla/tsl/framework:metrics", + "//tensorflow/core/data:captured_function", + ] + tf_additional_binary_deps(), soversion = VERSION, static_deps = PACKAGE_STATIC_DEPS, visibility = ["//visibility:public"], diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 726433bafded24..3f4ec98028e8c3 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -298,7 +298,6 @@ tf_cuda_library( ], "//conditions:default": [ ":env", - ":logging", ":tf_status", ":tf_tensor", "//tensorflow/c/experimental/filesystem:modular_filesystem", @@ -325,18 +324,6 @@ tf_cuda_library( alwayslink = 1, ) -cc_library( - name = "logging", - srcs = ["logging.cc"], - hdrs = ["logging.h"], - visibility = ["//visibility:public"], - deps = [ - ":c_api_macros", - "//tensorflow/core/platform:logging", - "//tensorflow/core/platform:stringprintf", - ], -) - tf_cuda_library( name = "tf_status_internal", hdrs = [ diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index b919be52b0bf68..4dd78e4cd7bbb1 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -1171,7 +1171,7 @@ TEST_F(CApiFunctionTest, InvalidOutputTensor_BadNodePtr) { EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); EXPECT_EQ(string("Node is null\n\tEncountered while processing output 0 " "from function 'MyFunc'"), - string(TF_Message(s_))); + std::string(TF_Message(s_))); } TEST_F(CApiFunctionTest, NodeMissingInput) { diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index e3e7d812b15838..f59a73a0871945 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -2478,7 +2478,7 @@ TEST_F(CApiAttributesTest, Names) { TF_OperationGetAttrName(oper, 0, value.get(), s_); EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - EXPECT_EQ("v", string(static_cast(value.get()), 1)); + EXPECT_EQ("v", std::string(static_cast(value.get()), 1)); } TEST_F(CApiAttributesTest, Errors) { diff --git a/tensorflow/c/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc index 97a5bbd4b6077a..9dae0d3afd46fe 100644 --- a/tensorflow/c/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -119,8 +119,7 @@ CheckpointReader::BuildV2VarMaps() { BundleEntryProto entry; v2_reader_->Seek(kHeaderEntryKey); for (v2_reader_->Next(); v2_reader_->Valid(); v2_reader_->Next()) { - CHECK(entry.ParseFromArray(v2_reader_->value().data(), - v2_reader_->value().size())) + CHECK(entry.ParseFromString(v2_reader_->value())) << entry.InitializationErrorString(); for (int i = 0; i < entry.slices_size(); ++i) { const auto& slice_proto = entry.slices(i); @@ -140,8 +139,7 @@ CheckpointReader::BuildV2VarMaps() { v2_reader_->Seek(kHeaderEntryKey); for (v2_reader_->Next(); v2_reader_->Valid(); v2_reader_->Next()) { if (filtered_keys.count(string(v2_reader_->key())) > 0) continue; - CHECK(entry.ParseFromArray(v2_reader_->value().data(), - v2_reader_->value().size())) + CHECK(entry.ParseFromString(v2_reader_->value())) << entry.InitializationErrorString(); string key(v2_reader_->key()); (*var_to_shape_map)[key] = TensorShape(entry.shape()); diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index ccde2ba3d9b769..91f83b3f88967d 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -939,7 +939,8 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx, const char* serialized_function_def, size_t size, 
                                TF_Status* status) {
   tensorflow::FunctionDef function_def;
-  if (!function_def.ParseFromArray(serialized_function_def, size)) {
+  if (!function_def.ParseFromString(
+          absl::string_view(serialized_function_def, size))) {
     status->status =
         tensorflow::errors::InvalidArgument("Invalid FunctionDef proto");
     return;
diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc
index 03dd862f95cb0f..7d25709df2dfc7 100644
--- a/tensorflow/c/env.cc
+++ b/tensorflow/c/env.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 struct TF_StringStream {
-  std::vector<::tensorflow::string>* list;
+  std::vector<std::string>* list;
   size_t position;
 };
 
@@ -134,7 +134,7 @@ void TF_StringStreamDone(TF_StringStream* list) { delete list; }
 
 TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) {
-  auto* children = new std::vector<::tensorflow::string>;
+  auto* children = new std::vector<std::string>;
 
   TF_SetStatus(status, TF_OK, "");
   ::tensorflow::Set_TF_Status_from_Status(
@@ -147,7 +147,7 @@ TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) {
 }
 
 TF_StringStream* TF_GetLocalTempDirectories() {
-  auto* tmpdirs = new std::vector<::tensorflow::string>;
+  auto* tmpdirs = new std::vector<std::string>;
 
   ::tensorflow::Env::Default()->GetLocalTempDirectories(tmpdirs);
diff --git a/tensorflow/c/env_test.cc b/tensorflow/c/env_test.cc
index d4c9bfce3c2127..3d338d4377366b 100644
--- a/tensorflow/c/env_test.cc
+++ b/tensorflow/c/env_test.cc
@@ -35,14 +35,12 @@ TEST(TestEnv, TestDirHandling) {
   TF_Status* s = TF_NewStatus();
 
-  ::tensorflow::string dirpath =
-      ::tensorflow::io::JoinPath(tempdir, "somedir");
+  std::string dirpath = ::tensorflow::io::JoinPath(tempdir, "somedir");
   TF_CreateDir(dirpath.c_str(), s);
   ASSERT_TF_OK(s) << "TF_CreateDir failed for " << dirpath << ": "
                   << TF_Message(s);
 
-  ::tensorflow::string filepath =
-      ::tensorflow::io::JoinPath(dirpath, "somefile.txt");
+  std::string filepath = ::tensorflow::io::JoinPath(dirpath, "somefile.txt");
 
   TF_WritableFileHandle* handle;
   TF_NewWritableFile(filepath.c_str(), &handle, s);
   ASSERT_TF_OK(s) << "NewWritableFile failed for " << filepath << ": "
@@ -61,7 +59,7 @@ TEST(TestEnv, TestDirHandling) {
   ASSERT_TF_OK(s) << "TF_GetChildren failed for " << dirpath;
   const char* childpath;
   ASSERT_TRUE(TF_StringStreamNext(children, &childpath));
-  ASSERT_EQ(::tensorflow::string(childpath), "somefile.txt");
+  ASSERT_EQ(std::string(childpath), "somefile.txt");
   // There should only be one file in this directory.
   ASSERT_FALSE(TF_StringStreamNext(children, &childpath));
   ASSERT_EQ(childpath, nullptr);
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
index 8fa3e726e6a837..f0f6e5351372e1 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
@@ -31,10 +31,10 @@ cc_library(
         ":gcs_helper",
         ":ram_file_block_cache",
         "//tensorflow/c:env",
-        "//tensorflow/c:logging",
         "//tensorflow/c:tf_status",
         "//tensorflow/c/experimental/filesystem:filesystem_interface",
         "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client",
+        "@com_github_googlecloudplatform_google_cloud_cpp//google/cloud:google_cloud_cpp_common",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
@@ -65,7 +65,6 @@ cc_library(
     deps = [
         ":cleanup",
         "//tensorflow/c:env",
-        "//tensorflow/c:logging",
         "//tensorflow/c:tf_status",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log",
@@ -86,6 +85,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core/platform/cloud:now_seconds_env",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/synchronization",
        "@com_google_absl//absl/time",
        "@local_xla//xla/tsl/protobuf:error_codes_proto_impl_cc",
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc
index b0d283fff82d9b..e639f9a7dda476 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/expiring_lru_cache_test.cc
@@ -27,7 +27,7 @@ namespace tensorflow {
 namespace {
 
 TEST(ExpiringLRUCacheTest, MaxAge) {
-  const string key = "a";
+  const std::string key = "a";
   std::unique_ptr<NowSecondsEnv> env(new NowSecondsEnv);
   tf_gcs_filesystem::ExpiringLRUCache<int> cache(
       1, 0, [&env]() { return env->NowSeconds(); });
@@ -95,9 +95,10 @@ TEST(ExpiringLRUCacheTest, MaxEntries) {
 
 TEST(ExpiringLRUCacheTest, LookupOrCompute) {
   // max_age of 0 means we should always compute.
-  uint64 num_compute_calls = 0;
+  uint64_t num_compute_calls = 0;
   tf_gcs_filesystem::ExpiringLRUCache<int>::ComputeFunc compute_func =
-      [&num_compute_calls](const string& key, int* value, TF_Status* status) {
+      [&num_compute_calls](const std::string& key, int* value,
+                           TF_Status* status) {
         *value = num_compute_calls;
         num_compute_calls++;
         return TF_SetStatus(status, TF_OK, "");
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc
index 3b9650b7416315..f61208c7b4a174 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "google/cloud/storage/client.h"
 #include "tensorflow/c/env.h"
 #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h"
-#include "tensorflow/c/logging.h"
 #include "tensorflow/c/tf_status.h"
 
 // Implementation of a filesystem for GCS environments.
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h
index 0060abc76699c3..3e972fa6292995 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h
@@ -33,7 +33,6 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/synchronization/notification.h"
 #include "tensorflow/c/env.h"
-#include "tensorflow/c/logging.h"
 #include "tensorflow/c/tf_status.h"
 
 namespace tf_gcs_filesystem {
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc
index 4ad4a8ea1868f3..23645ed8e878bf 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h"
 
 #include
-#include <cctype>
 #include
 #include
 #include
@@ -25,6 +24,7 @@ limitations under the License.
 #include
 
 #include "absl/status/status.h"
+#include "absl/strings/ascii.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/synchronization/notification.h"
 #include "absl/time/time.h"
@@ -39,7 +39,7 @@ namespace tensorflow {
 namespace {
 
 absl::Status ReadCache(tf_gcs_filesystem::RamFileBlockCache* cache,
-                       const string& filename, size_t offset, size_t n,
+                       const std::string& filename, size_t offset, size_t n,
                        std::vector<char>* out) {
   out->clear();
   out->resize(n, 0);
@@ -54,7 +54,7 @@ absl::Status ReadCache(tf_gcs_filesystem::RamFileBlockCache* cache,
 }
 
 TEST(RamFileBlockCacheTest, IsCacheEnabled) {
-  auto fetcher = [](const string& filename, size_t offset, size_t n,
+  auto fetcher = [](const std::string& filename, size_t offset, size_t n,
                     char* buffer, TF_Status* status) -> int64_t {
     // Do nothing.
     TF_SetStatus(status, TF_OK, "");
@@ -73,14 +73,14 @@ TEST(RamFileBlockCacheTest, IsCacheEnabled) {
 
 TEST(RamFileBlockCacheTest, ValidateAndUpdateFileSignature) {
   int calls = 0;
-  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+  auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n,
                           char* buffer, TF_Status* status) -> int64_t {
     calls++;
     memset(buffer, 'x', n);
     TF_SetStatus(status, TF_OK, "");
     return n;
   };
-  string filename = "file";
+  std::string filename = "file";
   tf_gcs_filesystem::RamFileBlockCache cache(16, 32, 0, fetcher);
   std::vector<char> out;
 
@@ -101,12 +101,12 @@ TEST(RamFileBlockCacheTest, ValidateAndUpdateFileSignature) {
 }
 
 TEST(RamFileBlockCacheTest, PassThrough) {
-  const string want_filename = "foo/bar";
+  const std::string want_filename = "foo/bar";
   const size_t want_offset = 42;
   const size_t want_n = 1024;
   int calls = 0;
   auto fetcher = [&calls, want_filename, want_offset, want_n](
-                     const string& got_filename, size_t got_offset,
+                     const std::string& got_filename, size_t got_offset,
                      size_t got_n, char* buffer, TF_Status* status) -> int64_t {
     EXPECT_EQ(got_filename, want_filename);
     EXPECT_EQ(got_offset, want_offset);
@@ -143,7 +143,7 @@ TEST(RamFileBlockCacheTest, BlockAlignment) {
     buf.push_back(i);
   }
   // The fetcher just fetches slices of the buffer.
-  auto fetcher = [&buf](const string& filename, size_t offset, size_t n,
+  auto fetcher = [&buf](const std::string& filename, size_t offset, size_t n,
                         char* buffer, TF_Status* status) -> int64_t {
     int64_t bytes_transferred;
     if (offset < buf.size()) {
@@ -191,8 +191,8 @@ TEST(RamFileBlockCacheTest, BlockAlignment) {
 TEST(RamFileBlockCacheTest, CacheHits) {
   const size_t block_size = 16;
   std::set<size_t> calls;
-  auto fetcher = [&calls, block_size](const string& filename, size_t offset,
-                                      size_t n, char* buffer,
+  auto fetcher = [&calls, block_size](const std::string& filename,
+                                      size_t offset, size_t n, char* buffer,
                                       TF_Status* status) -> int64_t {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset % block_size, 0);
@@ -202,7 +202,7 @@ TEST(RamFileBlockCacheTest, CacheHits) {
     TF_SetStatus(status, TF_OK, "");
     return n;
   };
-  const uint32 block_count = 256;
+  const uint32_t block_count = 256;
   tf_gcs_filesystem::RamFileBlockCache cache(
       block_size, block_count * block_size, 0, fetcher);
   std::vector<char> out;
@@ -225,7 +225,7 @@ TEST(RamFileBlockCacheTest, OutOfRange) {
   bool first_block = false;
   bool second_block = false;
   auto fetcher = [block_size, file_size, &first_block, &second_block](
-                     const string& filename, size_t offset, size_t n,
+                     const std::string& filename, size_t offset, size_t n,
                      char* buffer, TF_Status* status) -> int64_t {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset % block_size, 0);
@@ -269,8 +269,9 @@ TEST(RamFileBlockCacheTest, Inconsistent) {
   // where we expected complete blocks.
   const size_t block_size = 16;
   // This fetcher returns OK but only fills in one byte for any offset.
-  auto fetcher = [block_size](const string& filename, size_t offset, size_t n,
-                              char* buffer, TF_Status* status) -> int64_t {
+  auto fetcher = [block_size](const std::string& filename, size_t offset,
+                              size_t n, char* buffer,
+                              TF_Status* status) -> int64_t {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset % block_size, 0);
     EXPECT_GE(n, 1);
@@ -293,8 +294,8 @@
 TEST(RamFileBlockCacheTest, LRU) {
   const size_t block_size = 16;
   std::list<size_t> calls;
-  auto fetcher = [&calls, block_size](const string& filename, size_t offset,
-                                      size_t n, char* buffer,
+  auto fetcher = [&calls, block_size](const std::string& filename,
+                                      size_t offset, size_t n, char* buffer,
                                       TF_Status* status) -> int64_t {
     EXPECT_EQ(n, block_size);
     EXPECT_FALSE(calls.empty()) << "at offset = " << offset;
@@ -306,7 +307,7 @@ TEST(RamFileBlockCacheTest, LRU) {
     TF_SetStatus(status, TF_OK, "");
     return n;
   };
-  const uint32 block_count = 2;
+  const uint32_t block_count = 2;
   tf_gcs_filesystem::RamFileBlockCache cache(
       block_size, block_count * block_size, 0, fetcher);
   std::vector<char> out;
@@ -342,7 +343,7 @@ TEST(RamFileBlockCacheTest, LRU) {
 
 TEST(RamFileBlockCacheTest, MaxStaleness) {
   int calls = 0;
-  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+  auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n,
                           char* buffer, TF_Status* status) -> int64_t {
     calls++;
     memset(buffer, 'x', n);
@@ -386,13 +387,13 @@ TEST(RamFileBlockCacheTest, MaxStaleness) {
 
 TEST(RamFileBlockCacheTest, RemoveFile) {
   int calls = 0;
-  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+  auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n,
                           char* buffer, TF_Status* status) -> int64_t {
     calls++;
     char c = (filename == "a") ? 'a' : (filename == "b") ? 'b' : 'x';
     if (offset > 0) {
       // The first block is lower case and all subsequent blocks are upper case.
-      c = toupper(c);
+      c = absl::ascii_toupper(c);
     }
     memset(buffer, c, n);
     TF_SetStatus(status, TF_OK, "");
     return n;
@@ -448,7 +449,7 @@ TEST(RamFileBlockCacheTest, RemoveFile) {
 
 TEST(RamFileBlockCacheTest, Prune) {
   int calls = 0;
-  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+  auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n,
                           char* buffer, TF_Status* status) -> int64_t {
     calls++;
     memset(buffer, 'x', n);
@@ -458,7 +459,7 @@ TEST(RamFileBlockCacheTest, Prune) {
   std::vector<char> out;
   // Our fake environment is initialized with the current timestamp.
   std::unique_ptr<NowSecondsEnv> env(new NowSecondsEnv);
-  uint64 now = Env::Default()->NowSeconds();
+  uint64_t now = Env::Default()->NowSeconds();
   env->SetNowSeconds(now);
   tf_gcs_filesystem::RamFileBlockCache cache(
       8, 32, 1 /* max staleness */, fetcher,
@@ -487,7 +488,7 @@ TEST(RamFileBlockCacheTest, Prune) {
   // timestamp of `now` + 2, file "a" is stale because its first block is stale,
   // but file "b" is not stale yet. Thus, once the pruning thread wakes up (in
   // one second of wall time), it should remove "a" and leave "b" alone.
-  uint64 start = Env::Default()->NowSeconds();
+  uint64_t start = Env::Default()->NowSeconds();
   do {
     Env::Default()->SleepForMicroseconds(100000);
   } while (cache.CacheSize() == 24 && Env::Default()->NowSeconds() - start < 3);
@@ -515,7 +516,7 @@ TEST(RamFileBlockCacheTest, ParallelReads) {
   absl::BlockingCounter counter(callers);
   absl::Notification notification;
   auto fetcher = [&counter, &notification](
-                     const string& filename, size_t offset, size_t n,
+                     const std::string& filename, size_t offset, size_t n,
                      char* buffer, TF_Status* status) -> int64_t {
     if (counter.DecrementCount()) {
       notification.Notify();
@@ -560,7 +561,7 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) {
   int num_requests = 0;
   absl::Notification notification;
   auto fetcher = [&num_requests, &notification, block_size](
-                     const string& filename, size_t offset, size_t n,
+                     const std::string& filename, size_t offset, size_t n,
                      char* buffer, TF_Status* status) -> int64_t {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset, 0);
@@ -591,7 +592,7 @@ TEST(RamFileBlockCacheTest, CoalesceConcurrentReads) {
 
 TEST(RamFileBlockCacheTest, Flush) {
   int calls = 0;
-  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+  auto fetcher = [&calls](const std::string& filename, size_t offset, size_t n,
                           char* buffer, TF_Status* status) -> int64_t {
     calls++;
     memset(buffer, 'x', n);
diff --git a/tensorflow/c/experimental/grappler/grappler_test.cc b/tensorflow/c/experimental/grappler/grappler_test.cc
index 32ac04832551c1..205aeec55ebf8c 100644
--- a/tensorflow/c/experimental/grappler/grappler_test.cc
+++ b/tensorflow/c/experimental/grappler/grappler_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include
 #include
 #include
+#include <string>
 #include
 #include
 
@@ -70,11 +71,11 @@ TEST(Grappler, SuccessfulRegistration) {
   TF_ASSERT_OK(InitGraphPlugin(plugin_init));
 
   ASSERT_EQ(PluginGraphOptimizerRegistry::CreateOptimizers(
-                std::set<string>{"Success"})
+                std::set<std::string>{"Success"})
                 .size(),
             1);
   ConfigList config = PluginGraphOptimizerRegistry::GetPluginConfigs(
-      true, std::set<string>{"Success"});
+      true, std::set<std::string>{"Success"});
   ASSERT_EQ(config.toggle_config["remapping"], RewriterConfig::OFF);
 }
 
@@ -95,7 +96,7 @@ TEST(Grappler, MultiplePluginRegistration) {
   TF_ASSERT_OK(InitGraphPlugin(plugin_init_0));
   TF_ASSERT_OK(InitGraphPlugin(plugin_init_1));
   ASSERT_EQ(PluginGraphOptimizerRegistry::CreateOptimizers(
-                std::set<string>{"Device0", "Device1"})
+                std::set<std::string>{"Device0", "Device1"})
                 .size(),
             2);
 }
@@ -132,12 +133,12 @@ TEST(Grappler, OptimizeFuncNotSet) {
 
 TEST(TF_GrapplerItem, NodesToPreserve) {
   GrapplerItem item;
-  item.fetch = std::vector<string>{"Conv", "BiasAdd"};
-  std::unordered_set<string> nodes_preserved = item.NodesToPreserve();
+  item.fetch = std::vector<std::string>{"Conv", "BiasAdd"};
+  std::unordered_set<std::string> nodes_preserved = item.NodesToPreserve();
   TF_GrapplerItem* c_item = reinterpret_cast<TF_GrapplerItem*>(&item);
 
   int list_total_size = 0;
-  for (const string& s : nodes_preserved) {
+  for (const std::string& s : nodes_preserved) {
     list_total_size += s.size();
   }
 
@@ -158,20 +159,21 @@ TEST(TF_GrapplerItem, NodesToPreserve) {
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   for (size_t i = 0; i < nodes_preserved.size(); ++i) {
-    EXPECT_EQ(nodes_preserved.find(string(static_cast<const char*>(values[i]),
-                                          lens[i])) != nodes_preserved.end(),
-              true);
+    EXPECT_EQ(
+        nodes_preserved.find(std::string(static_cast<const char*>(values[i]),
+                                         lens[i])) != nodes_preserved.end(),
+        true);
   }
   TF_DeleteStatus(status);
 }
 
 TEST(TF_GrapplerItem, FetchNodes) {
   GrapplerItem item;
-  item.fetch = std::vector<string>{"Conv", "BiasAdd"};
+  item.fetch = std::vector<std::string>{"Conv", "BiasAdd"};
   TF_GrapplerItem* c_item = reinterpret_cast<TF_GrapplerItem*>(&item);
 
   int list_total_size = 0;
-  for (const string& s : item.fetch) {
+  for (const std::string& s : item.fetch) {
     list_total_size += s.size();
   }
 
@@ -193,7 +195,7 @@ TEST(TF_GrapplerItem, FetchNodes) {
   for (size_t i = 0; i < item.fetch.size(); ++i) {
     EXPECT_EQ(item.fetch[i].size(), lens[i]) << i;
     EXPECT_EQ(item.fetch[i],
-              string(static_cast<const char*>(values[i]), lens[i]))
+              std::string(static_cast<const char*>(values[i]), lens[i]))
         << i;
   }
   TF_DeleteStatus(status);
@@ -307,13 +309,13 @@ TEST(TF_FunctionLibraryDefinition, LookUpOpDef) {
       TF_NewFunctionLibraryDefinition(g_buf, status);
 
   TF_LookUpOpDef(func, "Add", op_buf, status);
-  string actual_string(reinterpret_cast<const char*>(op_buf->data),
-                       op_buf->length);
+  std::string actual_string(reinterpret_cast<const char*>(op_buf->data),
+                            op_buf->length);
   ASSERT_EQ(TF_OK, TF_GetCode(status));
 
   const OpDef* expected_op_def;
   TF_ASSERT_OK(OpRegistry::Global()->LookUpOpDef("Add", &expected_op_def));
-  string expected_serialized;
+  std::string expected_serialized;
   expected_op_def->SerializeToString(&expected_serialized);
   EXPECT_EQ(expected_serialized, actual_string);
   TF_DeleteBuffer(g_buf);
diff --git a/tensorflow/c/experimental/ops/gen/common/case_format.cc b/tensorflow/c/experimental/ops/gen/common/case_format.cc
index 82acc32f623fd8..52808e9900ca49 100644
--- a/tensorflow/c/experimental/ops/gen/common/case_format.cc
+++ b/tensorflow/c/experimental/ops/gen/common/case_format.cc
@@ -31,14 +31,14 @@ enum CaseFormatType {
   UPPER_SNAKE,
 };
 
-string FormatStringCase(const string &str, CaseFormatType to,
-                        const char delimiter = '_') {
+std::string FormatStringCase(const std::string& str, CaseFormatType to,
+                             const char delimiter = '_') {
   const bool from_snake =
       (str == absl::AsciiStrToUpper(str)) || (str == absl::AsciiStrToLower(str));
   const bool toUpper = (to == UPPER_CAMEL || to == UPPER_SNAKE);
   const bool toSnake = (to == LOWER_SNAKE || to == UPPER_SNAKE);
 
-  string result;
+  std::string result;
 
   bool inputStart = true;
   bool wordStart = true;
@@ -52,7 +52,7 @@ string FormatStringCase(const string &str, CaseFormatType to,
       wordStart = true;
       continue;
     }
-    if (!from_snake && isupper(c)) {
+    if (!from_snake && absl::ascii_isupper(c)) {
       wordStart = true;
     }
@@ -65,9 +65,9 @@
     const bool shouldCapIfSnake = toUpper;
     const bool shouldCapIfCamel = wordStart && (toUpper || !inputStart);
     if ((toSnake && shouldCapIfSnake) || (!toSnake && shouldCapIfCamel)) {
-      result += toupper(c);
+      result += absl::ascii_toupper(c);
     } else {
-      result += tolower(c);
+      result += absl::ascii_tolower(c);
    }
 
     // at this point we are no longer at the start of a word:
@@ -90,16 +90,16 @@
 // Public interface
 //
 
-string toLowerCamel(const string &s, const char delimiter) {
+std::string toLowerCamel(const std::string& s, const char delimiter) {
   return FormatStringCase(s, LOWER_CAMEL, delimiter);
 }
-string toLowerSnake(const string &s, const char delimiter) {
+std::string toLowerSnake(const std::string& s, const char delimiter) {
   return FormatStringCase(s, LOWER_SNAKE, delimiter);
 }
-string toUpperCamel(const string &s, const char delimiter) {
+std::string toUpperCamel(const std::string& s, const char delimiter) {
   return FormatStringCase(s, UPPER_CAMEL, delimiter);
 }
-string toUpperSnake(const string &s, const char delimiter) {
+std::string toUpperSnake(const std::string& s, const char delimiter) {
   return FormatStringCase(s, UPPER_SNAKE, delimiter);
 }
diff --git a/tensorflow/c/experimental/ops/gen/common/case_format.h b/tensorflow/c/experimental/ops/gen/common/case_format.h
index f8255f6aa21c17..880f286788e0a2 100644
--- a/tensorflow/c/experimental/ops/gen/common/case_format.h
+++ b/tensorflow/c/experimental/ops/gen/common/case_format.h
@@ -35,10 +35,10 @@ namespace generator {
 // "__OneTwo__" (in camel case) <==> "__ONE_TWO__" (in snake case)
 //
 // Note: performance not yet tested.
-string toLowerCamel(const string &s, const char delimiter = '_');
-string toLowerSnake(const string &s, const char delimiter = '_');
-string toUpperCamel(const string &s, const char delimiter = '_');
-string toUpperSnake(const string &s, const char delimiter = '_');
+std::string toLowerCamel(const std::string& s, const char delimiter = '_');
+std::string toLowerSnake(const std::string& s, const char delimiter = '_');
+std::string toUpperCamel(const std::string& s, const char delimiter = '_');
+std::string toUpperSnake(const std::string& s, const char delimiter = '_');
 
 }  // namespace generator
 }  // namespace tensorflow
diff --git a/tensorflow/c/experimental/ops/gen/common/case_format_test.cc b/tensorflow/c/experimental/ops/gen/common/case_format_test.cc
index 302bcc42453169..e60473fca7896d 100644
--- a/tensorflow/c/experimental/ops/gen/common/case_format_test.cc
+++ b/tensorflow/c/experimental/ops/gen/common/case_format_test.cc
@@ -25,13 +25,13 @@ namespace {
 
 // For each test case, we manually construct the 4 variations in string case and
 // test all 16 conversions: from and to each of the 4 string case variations.
 struct Variations {
-  string lower_camel;
-  string lower_snake;
-  string upper_camel;
-  string upper_snake;
+  std::string lower_camel;
+  std::string lower_snake;
+  std::string upper_camel;
+  std::string upper_snake;
 };
 
-void TestSingleVariation(const string &str, Variations expected,
+void TestSingleVariation(const std::string& str, Variations expected,
                          char delimiter = '_') {
   EXPECT_EQ(expected.lower_camel, toLowerCamel(str, delimiter));
   EXPECT_EQ(expected.lower_snake, toLowerSnake(str, delimiter));
diff --git a/tensorflow/c/experimental/ops/gen/common/controller.cc b/tensorflow/c/experimental/ops/gen/common/controller.cc
index fb3e321714b108..ae3be0379ff254 100644
--- a/tensorflow/c/experimental/ops/gen/common/controller.cc
+++ b/tensorflow/c/experimental/ops/gen/common/controller.cc
@@ -43,7 +43,7 @@ Controller::Controller(PathConfig path_config, Env* env)
 }
 
 Controller::~Controller() { delete api_def_map_; }
 
-const void Controller::WriteFile(const string& file_path,
+const void Controller::WriteFile(const std::string& file_path,
                                  const SourceCode& code) const {
   TF_CHECK_OK(WriteStringToFile(env_, file_path, code.Render())) << file_path;
 }
@@ -60,8 +60,9 @@ void Controller::InitializeOpApi() {
   api_def_map_ = new ApiDefMap(op_list_);
   for (const auto& op : op_list_.op()) {
     for (const auto& dir : path_config_.api_dirs) {
-      const string file_name = absl::Substitute("api_def_$0.pbtxt", op.name());
-      const string file_path = io::JoinPath(dir, file_name);
+      const std::string file_name =
+          absl::Substitute("api_def_$0.pbtxt", op.name());
+      const std::string file_path = io::JoinPath(dir, file_name);
       if (env_->FileExists(file_path).ok()) {
         TF_CHECK_OK(api_def_map_->LoadFile(env_, file_path)) << file_path;
       } else {
diff --git a/tensorflow/c/experimental/ops/gen/common/controller.h b/tensorflow/c/experimental/ops/gen/common/controller.h
index e152efeb6d8f9f..c33891f963d7a6 100644
--- a/tensorflow/c/experimental/ops/gen/common/controller.h
+++ b/tensorflow/c/experimental/ops/gen/common/controller.h
@@ -32,7 +32,8 @@ class Controller {
  public:
   explicit Controller(PathConfig path_config, Env* env = Env::Default());
   virtual ~Controller();
-  const void WriteFile(const string& file_path, const SourceCode& code) const;
+  const void WriteFile(const std::string& file_path,
+                       const SourceCode& code) const;
   const std::vector<OpSpec>& GetModelOps() const;
 
  private:
diff --git a/tensorflow/c/experimental/ops/gen/common/path_config.cc b/tensorflow/c/experimental/ops/gen/common/path_config.cc
index 2ec57d67c9d6f7..74b4c3e327223d 100644
--- a/tensorflow/c/experimental/ops/gen/common/path_config.cc
+++ b/tensorflow/c/experimental/ops/gen/common/path_config.cc
@@ -24,9 +24,10 @@ limitations under the License.
 namespace tensorflow {
 namespace generator {
 
-PathConfig::PathConfig(const string& output_dir, const string& source_dir,
-                       const string& api_dir_list,
-                       const std::vector<string> op_names)
+PathConfig::PathConfig(const std::string& output_dir,
+                       const std::string& source_dir,
+                       const std::string& api_dir_list,
+                       const std::vector<std::string> op_names)
     : output_path(output_dir), op_names(op_names) {
   api_dirs = str_util::Split(api_dir_list, ",", str_util::SkipEmpty());
@@ -39,7 +40,7 @@ PathConfig::PathConfig(
   tf_root_dir = "tensorflow";
 
   // Prefix, e.g. "third_party" given root_dir "third_party/tensorflow/...."
-  std::vector<string> source_path_components =
+  std::vector<std::string> source_path_components =
       tensorflow::str_util::Split(source_dir, "/");
   auto source_tfroot_pos = std::find(source_path_components.begin(),
                                      source_path_components.end(),
                                      tf_root_dir);
@@ -51,7 +52,7 @@ PathConfig::PathConfig(
   }
 
   // TF subdir, e.g. "c/ops" given output_dir "blah/blah/tensorflow/c/ops"
-  std::vector<string> output_path_components =
+  std::vector<std::string> output_path_components =
       tensorflow::str_util::Split(output_dir, "/");
   auto output_tfroot_pos = std::find(output_path_components.begin(),
                                      output_path_components.end(),
                                      tf_root_dir);
diff --git a/tensorflow/c/experimental/ops/gen/common/path_config.h b/tensorflow/c/experimental/ops/gen/common/path_config.h
index ce29063be5f682..d47266f86e38ef 100644
--- a/tensorflow/c/experimental/ops/gen/common/path_config.h
+++ b/tensorflow/c/experimental/ops/gen/common/path_config.h
@@ -23,17 +23,18 @@ namespace tensorflow {
 namespace generator {
 
 struct PathConfig {
-  string output_path;
-  std::vector<string> op_names;
-  std::vector<string> api_dirs;
-  string tf_prefix_dir;
-  string tf_root_dir;
-  string tf_output_dir;
+  std::string output_path;
+  std::vector<std::string> op_names;
+  std::vector<std::string> api_dirs;
+  std::string tf_prefix_dir;
+  std::string tf_root_dir;
+  std::string tf_output_dir;
 
   explicit PathConfig() = default;
-  explicit PathConfig(const string &output_dir, const string &source_dir,
-                      const string &api_dir_list,
-                      const std::vector<string> op_names);
+  explicit PathConfig(const std::string& output_dir,
+                      const std::string& source_dir,
+                      const std::string& api_dir_list,
+                      const std::vector<std::string> op_names);
 };
 
 }  // namespace generator
diff --git a/tensorflow/c/experimental/ops/gen/common/source_code.cc b/tensorflow/c/experimental/ops/gen/common/source_code.cc
index 2b7bce6a263184..b12949cd1dc12b 100644
--- a/tensorflow/c/experimental/ops/gen/common/source_code.cc
+++ b/tensorflow/c/experimental/ops/gen/common/source_code.cc
@@ -25,20 +25,20 @@ limitations under the License.
 namespace tensorflow {
 namespace generator {
 
-string SourceCode::Render() const {
-  string code;
+std::string SourceCode::Render() const {
+  std::string code;
   for (const Line& line : lines_) {
-    absl::StrAppend(&code, string(line.indent * spaces_per_indent_, ' '),
+    absl::StrAppend(&code, std::string(line.indent * spaces_per_indent_, ' '),
                     line.text, "\n");
   }
   return code;
 }
 
-void SourceCode::AddLineWithIndent(const string& line) {
+void SourceCode::AddLineWithIndent(const std::string& line) {
   ValidateAndAddLine(current_indent_, line);
 }
 
-void SourceCode::AddLineWithoutIndent(const string& line) {
+void SourceCode::AddLineWithoutIndent(const std::string& line) {
   ValidateAndAddLine(0, line);
 }
 
@@ -48,7 +48,7 @@ void SourceCode::IncreaseIndent() { current_indent_++; }
 
 void SourceCode::DecreaseIndent() { current_indent_--; }
 
-void SourceCode::ValidateAndAddLine(int indent, const string& raw_line) {
+void SourceCode::ValidateAndAddLine(int indent, const std::string& raw_line) {
   absl::string_view line(raw_line);
   bool had_trailing_newline = absl::ConsumeSuffix(&line, "\n");
@@ -57,7 +57,8 @@ void SourceCode::ValidateAndAddLine(int indent, const string& raw_line) {
   } else if (had_trailing_newline) {
     LOG(WARNING) << "Superfluous trailing newline in '" << line << "'";
   }
-  lines_.push_back({indent, string(absl::StripTrailingAsciiWhitespace(line))});
+  lines_.push_back(
+      {indent, std::string(absl::StripTrailingAsciiWhitespace(line))});
 }
 
 }  // namespace generator
diff --git a/tensorflow/c/experimental/ops/gen/common/source_code.h b/tensorflow/c/experimental/ops/gen/common/source_code.h
index df1aa90acf7b8c..9fd7f7eec5e174 100644
--- a/tensorflow/c/experimental/ops/gen/common/source_code.h
+++ b/tensorflow/c/experimental/ops/gen/common/source_code.h
@@ -24,13 +24,13 @@ namespace generator {
 
 class SourceCode {
  public:
-  string Render() const;
+  std::string Render() const;
 
   void SetSpacesPerIndent(int spaces_per_indent) {
     spaces_per_indent_ = spaces_per_indent;
   }
 
-  void AddLineWithIndent(const string &line);
-  void AddLineWithoutIndent(const string &line);
+  void AddLineWithIndent(const std::string& line);
+  void AddLineWithoutIndent(const std::string& line);
   void AddBlankLine();
   void IncreaseIndent();
   void DecreaseIndent();
@@ -38,10 +38,10 @@ class SourceCode {
  private:
   struct Line {
     int indent;
-    string text;
+    std::string text;
   };
 
-  void ValidateAndAddLine(int indent_level, const string &raw_line);
+  void ValidateAndAddLine(int indent_level, const std::string& raw_line);
 
   int spaces_per_indent_ = 2;
   int current_indent_ = 0;
diff --git a/tensorflow/c/experimental/ops/gen/common/view_util.cc b/tensorflow/c/experimental/ops/gen/common/view_util.cc
index 388aa0646db82b..5ca9b59c9841e9 100644
--- a/tensorflow/c/experimental/ops/gen/common/view_util.cc
+++ b/tensorflow/c/experimental/ops/gen/common/view_util.cc
@@ -23,17 +23,20 @@ limitations under the License.
 namespace tensorflow {
 namespace generator {
 
-string Call(const string& object, const string& method,
-            std::vector<string> arguments, const char* oper) {
+std::string Call(const std::string& object, const std::string& method,
+                 std::vector<std::string> arguments, const char* oper) {
   return absl::Substitute("$0$1$2($3)", object, oper, method,
                           absl::StrJoin(arguments, ", "));
 }
 
-string Call(const string& function, std::vector<string> arguments) {
+std::string Call(const std::string& function,
+                 std::vector<std::string> arguments) {
   return absl::Substitute("$0($1)", function, absl::StrJoin(arguments, ", "));
 }
 
-string Quoted(const string& s) { return absl::Substitute("\"$0\"", s); }
+std::string Quoted(const std::string& s) {
+  return absl::Substitute("\"$0\"", s);
+}
 
 }  // namespace generator
 }  // namespace tensorflow
diff --git a/tensorflow/c/experimental/ops/gen/common/view_util.h b/tensorflow/c/experimental/ops/gen/common/view_util.h
index 7ab437a90e4fd8..f23831ce8a07dd 100644
--- a/tensorflow/c/experimental/ops/gen/common/view_util.h
+++ b/tensorflow/c/experimental/ops/gen/common/view_util.h
@@ -22,10 +22,11 @@ limitations under the License.
 namespace tensorflow {
 namespace generator {
 
-string Call(const string &function, std::vector<string> arguments);
-string Call(const string &object, const string &method,
-            std::vector<string> arguments, const char *oper = "->");
-string Quoted(const string &s);
+std::string Call(const std::string& function,
+                 std::vector<std::string> arguments);
+std::string Call(const std::string& object, const std::string& method,
+                 std::vector<std::string> arguments, const char* oper = "->");
+std::string Quoted(const std::string& s);
 
 }  // namespace generator
 }  // namespace tensorflow
diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc
index 3fe5c059ca4e70..45e7b87069e361 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc
@@ -52,11 +52,11 @@ SourceCode CppGenerator::SourceFileContents() const {
   return GenerateOneFile(cpp::RendererContext::kSource);
 }
 
-string CppGenerator::HeaderFileName() const {
+std::string CppGenerator::HeaderFileName() const {
   return io::JoinPath(path_config_.output_path, cpp_config_.unit + "_ops.h");
 }
 
-string CppGenerator::SourceFileName() const {
+std::string CppGenerator::SourceFileName() const {
   return io::JoinPath(path_config_.output_path, cpp_config_.unit + "_ops.cc");
 }
 
diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h
index 0a7b08cd9b171f..b4d016e0ecca44 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h
@@ -30,8 +30,8 @@ class CppGenerator {
   explicit CppGenerator(cpp::CppConfig cpp_config, PathConfig path_config);
   SourceCode HeaderFileContents() const;
   SourceCode SourceFileContents() const;
-  string HeaderFileName() const;
-  string SourceFileName() const;
+  std::string HeaderFileName() const;
+  std::string SourceFileName() const;
   void WriteHeaderFile() const;
   void WriteSourceFile() const;
 
diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc
index f4a4d82bbce423..e1db2c9b8ce14b 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc
@@ -30,12 +30,12 @@ namespace generator {
 namespace {
 
 TEST(CppGeneratorTest, typical_usage) {
-  string category = "testing";
-  string name_space = "tensorflow::ops";
-  string output_dir = "tensorflow/c/experimental/ops/gen/cpp/golden";
-  string source_dir = "tensorflow";
-  string api_dirs = "";
-  std::vector<string> ops = {
+  std::string category = "testing";
+  std::string name_space = "tensorflow::ops";
+  std::string output_dir = "tensorflow/c/experimental/ops/gen/cpp/golden";
+  std::string source_dir = "tensorflow";
+  std::string api_dirs = "";
+  std::vector<std::string> ops = {
       "Neg",        // Simple unary Op
       "MatMul",     // 2 inputs & attrs with default values
       "IdentityN",  // Variadic input+output
@@ -50,17 +50,19 @@ TEST(CppGeneratorTest, typical_usage) {
   CppGenerator generator(cpp_config, controller_config);
   Env *env = Env::Default();
 
-  string golden_dir = io::JoinPath(testing::TensorFlowSrcRoot(),
-                                   controller_config.tf_output_dir);
+  std::string golden_dir = io::JoinPath(testing::TensorFlowSrcRoot(),
+                                        controller_config.tf_output_dir);
 
-  string generated_header = generator.HeaderFileContents().Render();
-  string generated_source = generator.SourceFileContents().Render();
-  string expected_header;
-  string header_file_name = io::JoinPath(golden_dir, "testing_ops.h.golden");
+  std::string generated_header = generator.HeaderFileContents().Render();
+  std::string generated_source = generator.SourceFileContents().Render();
+  std::string expected_header;
+  std::string header_file_name =
+      io::JoinPath(golden_dir, "testing_ops.h.golden");
   TF_CHECK_OK(ReadFileToString(env, header_file_name, &expected_header));
 
-  string expected_source;
-  string source_file_name = io::JoinPath(golden_dir, "testing_ops.cc.golden");
+  std::string expected_source;
+  std::string source_file_name =
+      io::JoinPath(golden_dir, "testing_ops.cc.golden");
   TF_CHECK_OK(ReadFileToString(env, source_file_name, &expected_source));
 
   // Remove carriage returns (for Windows)
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
index 4f0e64e3b0f8eb..7c8231a71133f5 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 namespace generator {
 namespace cpp {
 
-CppConfig::CppConfig(const string &category, const string &name_space)
+CppConfig::CppConfig(const std::string& category, const std::string& name_space)
     : category(category),
       unit(absl::AsciiStrToLower(category)),
       namespaces(absl::StrSplit(name_space, "::")) {}
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h
index fa7571d98a1214..eec5888e17e7cf 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h
@@ -24,13 +24,13 @@ namespace generator {
 namespace cpp {
 
 struct CppConfig {
-  string category;
-  string unit;
-  std::vector<string> namespaces;
+  std::string category;
+  std::string unit;
+  std::vector<std::string> namespaces;
 
   explicit CppConfig() = default;
-  explicit CppConfig(const string &category,
-                     const string &name_space = "tensorflow::ops");
+  explicit CppConfig(const std::string& category,
+                     const std::string& name_space = "tensorflow::ops");
 };
 
 }  // namespace cpp
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
index 1a685cac0c405c..50db08df1db988 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc @@ -27,10 +27,10 @@ namespace generator { namespace cpp { GuardRenderer::GuardRenderer(RendererContext context) : Renderer(context) { - string self_path = io::JoinPath(context_.path_config.tf_root_dir, - context_.path_config.tf_output_dir, - context_.cpp_config.unit + "_ops.h"); - string with_underscores(self_path); + std::string self_path = io::JoinPath(context_.path_config.tf_root_dir, + context_.path_config.tf_output_dir, + context_.cpp_config.unit + "_ops.h"); + std::string with_underscores(self_path); std::replace(with_underscores.begin(), with_underscores.end(), '/', '_'); std::replace(with_underscores.begin(), with_underscores.end(), '.', '_'); guard_ = toUpperSnake(with_underscores) + "_"; diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h index a45fe89a7a011c..bbd29e4620e2c2 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h @@ -31,7 +31,7 @@ class GuardRenderer : public Renderer { void Close(); private: - string guard_; + std::string guard_; }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc index 38f31209f6da24..0ec8108bee7aaf 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc @@ -30,13 +30,13 @@ void IncludeRenderer::SelfHeader() { BlankLine(); } -string IncludeRenderer::SelfHeaderPath() const { +std::string IncludeRenderer::SelfHeaderPath() const { return io::JoinPath(context_.path_config.tf_root_dir, context_.path_config.tf_output_dir, context_.cpp_config.unit + "_ops.h"); } -void IncludeRenderer::Include(const string &tf_file_path) { +void IncludeRenderer::Include(const std::string& tf_file_path) { CodeLine("#include \"$0\"", io::JoinPath(context_.path_config.tf_prefix_dir, tf_file_path)); } diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h index e43715a62e45b0..4178f0da5beeb9 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h @@ -27,12 +27,12 @@ class IncludeRenderer : public Renderer { public: explicit IncludeRenderer(RendererContext context); - string SelfHeaderPath() const; + std::string SelfHeaderPath() const; void SelfHeader(); void Headers(); private: - void Include(const string &tf_file_path); + void Include(const std::string& tf_file_path); }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc index db28ab303ae5c6..b490cc7fe9e86a 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc @@ -26,7 +26,7 @@ NamespaceRenderer::NamespaceRenderer(RendererContext context) : Renderer(context) {} void NamespaceRenderer::Open() { - for (const string& ns : context_.cpp_config.namespaces) { + for (const std::string& ns : context_.cpp_config.namespaces) { CodeLine("namespace " + ns + " {"); } } diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc 
b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc index c459d239ca699f..63cb5f30eb1d9d 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc @@ -31,11 +31,11 @@ namespace tensorflow { namespace generator { namespace cpp { -string OpRenderer::Signature() const { - std::vector<string> args_with_default_val; - std::vector<string> args_without_default_val; +std::string OpRenderer::Signature() const { + std::vector<std::string> args_with_default_val; + std::vector<std::string> args_without_default_val; for (OpArgumentView const& argument : op_.AllArguments()) { - string text = argument.Declaration(); + std::string text = argument.Declaration(); if (context_.mode == RendererContext::kHeader) { absl::StrAppend(&text, argument.Initializer()); } @@ -45,7 +45,7 @@ string OpRenderer::Signature() const { args_without_default_val.push_back(text); } } - std::vector<string> arguments; + std::vector<std::string> arguments; arguments.reserve(args_without_default_val.size() + args_with_default_val.size()); arguments.insert(arguments.end(), diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h index 3360e14e672e3a..1ea161f55bdad9 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h @@ -34,7 +34,7 @@ class OpRenderer : public Renderer { OpView op_; OpCommentRenderer comment_; - string Signature() const; + std::string Signature() const; }; } // namespace cpp diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc index a9efb94335c0a6..6a608d759a3753 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc @@ -34,21 +34,21 @@ Renderer& Renderer::BlankLine() { return *this; } -Renderer& Renderer::CodeLine(const string& text) { +Renderer& Renderer::CodeLine(const std::string& text) { context_.code.AddLineWithoutIndent(text); return *this; } -Renderer& Renderer::CodeLines(const string& text) { +Renderer& Renderer::CodeLines(const std::string& text) { absl::string_view trimmed_text(text); str_util::RemoveWhitespaceContext(&trimmed_text); - for (const string& line : str_util::Split(trimmed_text, '\n')) { + for (const std::string& line : str_util::Split(trimmed_text, '\n')) { context_.code.AddLineWithoutIndent(line); } return *this; } -Renderer& Renderer::Statement(const string& text) { +Renderer& Renderer::Statement(const std::string& text) { if (absl::EndsWith(text, ";")) { LOG(WARNING) << "Superfluous terminating ';' in '" << text << "'"; context_.code.AddLineWithIndent(text); @@ -58,22 +58,22 @@ Renderer& Renderer::Statement(const string& text) { return *this; } -Renderer& Renderer::TFStatement(const string& text) { +Renderer& Renderer::TFStatement(const std::string& text) { return Statement(absl::Substitute("TF_RETURN_IF_ERROR($0)", text)); } -Renderer& Renderer::CommentLine(const string& text) { +Renderer& Renderer::CommentLine(const std::string& text) { context_.code.AddLineWithIndent(absl::StrCat("// ", text)); return *this; } -Renderer& Renderer::BlockOpen(const string& text) { +Renderer& Renderer::BlockOpen(const std::string& text) { context_.code.AddLineWithIndent(absl::StrCat(text, " {")); context_.code.IncreaseIndent(); return *this; } -Renderer& Renderer::BlockClose(const string& text) { +Renderer& Renderer::BlockClose(const 
std::string& text) { context_.code.DecreaseIndent(); context_.code.AddLineWithIndent(absl::StrCat("}", text)); return *this; diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h index b6168b196b35b2..f41923651f44e2 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h @@ -34,7 +34,7 @@ class Renderer { // Append a line of source code, left-justified (not indented). // Use for preprocessors directives ("#include"), namespaces, etc. - Renderer &CodeLine(const string &text); + Renderer& CodeLine(const std::string& text); template <typename... Args> Renderer CodeLine(absl::string_view text, const Args &...args) { return CodeLine(absl::Substitute(text, args...)); } @@ -44,7 +44,7 @@ class Renderer { // Note: Trims leading/trailing whitespace including newlines, making this // method convenient for multiline raw strings. // Newlines ('\n') are allowed/expected. - Renderer &CodeLines(const string &text); + Renderer& CodeLines(const std::string& text); template <typename... Args> Renderer CodeLines(absl::string_view text, const Args &...args) { return CodeLines(absl::Substitute(text, args...)); } @@ -52,7 +52,7 @@ class Renderer { // Indent and append a C++ statement. // Note: do *not* include a trailing semicolon in the statement text. - Renderer &Statement(const string &text); + Renderer& Statement(const std::string& text); template <typename... Args> Renderer Statement(absl::string_view text, const Args &...args) { return Statement(absl::Substitute(text, args...)); } @@ -60,14 +60,14 @@ class Renderer { // Indent and append a call to a TF method returning a Status to check. // Note: do *not* include a trailing semicolon in the statement text. - Renderer &TFStatement(const string &text); + Renderer& TFStatement(const std::string& text); template <typename... Args> Renderer TFStatement(absl::string_view text, const Args &...args) { return TFStatement(absl::Substitute(text, args...)); } // Indent and append a C++ single-line style comment (using '//'). - Renderer &CommentLine(const string &text = ""); + Renderer& CommentLine(const std::string& text = ""); template <typename... Args> Renderer CommentLine(absl::string_view text, const Args &...args) { return CommentLine(absl::Substitute(text, args...)); } @@ -75,7 +75,7 @@ class Renderer { // Append a line of code which starts a new block: trailing with '{') and // indenting. - Renderer &BlockOpen(const string &text); + Renderer& BlockOpen(const std::string& text); template <typename... Args> Renderer BlockOpen(absl::string_view text, const Args &...args) { return BlockOpen(absl::Substitute(text, args...)); } @@ -83,7 +83,7 @@ class Renderer { // Append a line of code ending a block: unindenting and adding '}'. // Note: optional trailing text is often a comment, e.g. '// namespace xyz'. - Renderer &BlockClose(const string &text = ""); + Renderer& BlockClose(const std::string& text = ""); template <typename... Args> Renderer BlockClose(absl::string_view text, const Args &...args) { return BlockClose(absl::Substitute(text, args...)); diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc index eff654c5938160..6621d1aea2c217 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc @@ -57,7 +57,7 @@ TEST(Renderer, typical_usage) { SourceCode code; TestRenderer(code).Render(); - string expected = R"(// File level comment.
+ std::string expected = R"(// File level comment. #include "header.h" void TestFunction() { diff --git a/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc b/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc index 18a506942de5b7..cb922d0a06b7ae 100644 --- a/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc +++ b/tensorflow/c/experimental/ops/gen/generate_cpp_main.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include #include "absl/log/check.h" diff --git a/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc b/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc index c2bf61d785e6b2..417a0f26d70b92 100644 --- a/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc +++ b/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc @@ -26,8 +26,7 @@ namespace { SavedObjectGraph ParseSavedObjectGraph(absl::string_view text_proto) { SavedObjectGraph value; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString(string(text_proto), - &value)); + CHECK(tensorflow::protobuf::TextFormat::ParseFromString(text_proto, &value)); return value; } diff --git a/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc b/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc index 6250af6dba1359..1796c99dc79f17 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_concrete_function_test_protos.cc @@ -178,8 +178,7 @@ tuple_value: { StructuredValue ParseStructuredValue(absl::string_view text_proto) { StructuredValue value; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString(string(text_proto), - &value)); + CHECK(tensorflow::protobuf::TextFormat::ParseFromString(text_proto, &value)); return value; } diff --git a/tensorflow/c/kernels/bitcast_op_test.cc b/tensorflow/c/kernels/bitcast_op_test.cc index 64ff3dab035e8c..c44bc832547dab 100644 --- a/tensorflow/c/kernels/bitcast_op_test.cc +++ b/tensorflow/c/kernels/bitcast_op_test.cc @@ -86,13 +86,13 @@ void TestBitcastOp(Tensor* input_tensor, DataType out_type, TEST(BitcastOpTest, TestUpcast) { Tensor int8_input(DT_UINT8, {8}); for (int i = 0; i < 8; i++) { - int8_input.vec<uint8>()(i) = static_cast<uint8>(1); + int8_input.vec<uint8_t>()(i) = static_cast<uint8_t>(1); } TestBitcastOp(&int8_input, DT_UINT64, TensorShape(), error::OK); } TEST(BitcastOpTest, TestDowncast) { - Tensor int64_input(static_cast<uint64>(1)); + Tensor int64_input(static_cast<uint64_t>(1)); TestBitcastOp(&int64_input, DT_UINT8, TensorShape({8}), error::OK); } diff --git a/tensorflow/c/kernels/histogram_summary_op.cc b/tensorflow/c/kernels/histogram_summary_op.cc index 7f34e5217c20ba..35340baa5749ce 100644 --- a/tensorflow/c/kernels/histogram_summary_op.cc +++ b/tensorflow/c/kernels/histogram_summary_op.cc @@ -151,13 +151,13 @@ void RegisterHistogramSummaryOpKernel() { TF_ATTRIBUTE_UNUSED static bool IsHistogramSummaryOpKernelRegistered = []() { if (SHOULD_REGISTER_OP_KERNEL("HistogramSummary")) { RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - RegisterHistogramSummaryOpKernel(); - 
RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); + RegisterHistogramSummaryOpKernel(); RegisterHistogramSummaryOpKernel(); RegisterHistogramSummaryOpKernel(); RegisterHistogramSummaryOpKernel(); diff --git a/tensorflow/c/kernels/merge_summary_op.cc b/tensorflow/c/kernels/merge_summary_op.cc index 339267d094a554..ddbc3440d47dc1 100644 --- a/tensorflow/c/kernels/merge_summary_op.cc +++ b/tensorflow/c/kernels/merge_summary_op.cc @@ -50,7 +50,7 @@ void MergeSummaryOp_Delete(void* kernel) {} void MergeSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { tensorflow::Summary s; - std::unordered_set<tensorflow::string> tags; + std::unordered_set<std::string> tags; Safe_TF_StatusPtr status(TF_NewStatus()); for (int input_num = 0; input_num < TF_NumInputs(ctx); ++input_num) { TF_Tensor* input; @@ -74,7 +74,7 @@ void MergeSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { for (int v = 0; v < summary_in.value_size(); ++v) { // This tag is unused by the TensorSummary op, so no need to check for // duplicates. - const tensorflow::string& tag = summary_in.value(v).tag(); + const std::string& tag = summary_in.value(v).tag(); if ((!tag.empty()) && !tags.insert(tag).second) { std::ostringstream err; err << "Duplicate tag " << tag << " found in summary inputs "; diff --git a/tensorflow/c/kernels/summary_op.cc b/tensorflow/c/kernels/summary_op.cc index d158a429433c40..5688d00fa8fa7c 100644 --- a/tensorflow/c/kernels/summary_op.cc +++ b/tensorflow/c/kernels/summary_op.cc @@ -155,13 +155,13 @@ void RegisterScalarSummaryOpKernel() { TF_ATTRIBUTE_UNUSED bool IsScalarSummaryOpKernelRegistered = []() { if (SHOULD_REGISTER_OP_KERNEL("ScalarSummary")) { RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); - RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); + RegisterScalarSummaryOpKernel(); RegisterScalarSummaryOpKernel(); RegisterScalarSummaryOpKernel(); RegisterScalarSummaryOpKernel(); diff --git a/tensorflow/c/kernels/summary_op_test.cc b/tensorflow/c/kernels/summary_op_test.cc index da7b92f99491df..43de49bc39419d 100644 --- a/tensorflow/c/kernels/summary_op_test.cc +++ b/tensorflow/c/kernels/summary_op_test.cc @@ -45,13 +45,15 @@ class DummyDevice : public DeviceBase { }; // Helper for comparing output and expected output -void ExpectSummaryMatches(const Summary& actual, const string& expected_str) { +void ExpectSummaryMatches(const Summary& actual, + const std::string& expected_str) { Summary expected; ASSERT_TRUE(protobuf::TextFormat::ParseFromString(expected_str, &expected)); EXPECT_EQ(expected.DebugString(), actual.DebugString()); } -void TestScalarSummaryOp(Tensor* tags, Tensor* values, string expected_output, +void TestScalarSummaryOp(Tensor* tags, Tensor* values, + std::string expected_output, error::Code expected_code) { // Initialize node used to fetch OpKernel absl::Status status; diff --git a/tensorflow/c/kernels/tensor_shape_utils.cc b/tensorflow/c/kernels/tensor_shape_utils.cc index db0cfefedcbc86..ba54dc4eda4df9 100644 --- 
a/tensorflow/c/kernels/tensor_shape_utils.cc +++ b/tensorflow/c/kernels/tensor_shape_utils.cc @@ -26,7 +26,7 @@ namespace tensorflow { std::string ShapeDebugString(TF_Tensor* tensor) { // A TF_Tensor cannot have an unknown rank. CHECK_GE(TF_NumDims(tensor), 0); - tensorflow::string s = "["; + std::string s = "["; for (int i = 0; i < TF_NumDims(tensor); ++i) { if (i > 0) absl::StrAppend(&s, ","); int64_t dim = TF_Dim(tensor, i); diff --git a/tensorflow/c/logging.cc b/tensorflow/c/logging.cc deleted file mode 100644 index 13c9e6ac208a14..00000000000000 --- a/tensorflow/c/logging.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/c/logging.h" - -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stringprintf.h" - -static ::tensorflow::string BuildMessage(const char* fmt, va_list args) { - ::tensorflow::string message; - ::tensorflow::strings::Appendv(&message, fmt, args); - return message; -} - -void TF_Log(TF_LogLevel level, const char* fmt, ...) { - if (level < TF_INFO || level > TF_FATAL) return; - va_list args; - va_start(args, fmt); - auto message = BuildMessage(fmt, args); - va_end(args); - switch (level) { - case TF_INFO: - LOG(INFO) << message; - break; - case TF_WARNING: - LOG(WARNING) << message; - break; - case TF_ERROR: - LOG(ERROR) << message; - break; - case TF_FATAL: - LOG(FATAL) << message; - break; - } -} - -void TF_VLog(int level, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - auto message = BuildMessage(fmt, args); - va_end(args); - VLOG(level) << message; -} - -void TF_DVLog(int level, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - auto message = BuildMessage(fmt, args); - va_end(args); - DVLOG(level) << message; -} diff --git a/tensorflow/c/logging.h b/tensorflow/c/logging.h deleted file mode 100644 index 9583777b661122..00000000000000 --- a/tensorflow/c/logging.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_C_LOGGING_H_ -#define TENSORFLOW_C_LOGGING_H_ - -#include "tensorflow/c/c_api_macros.h" - -// -------------------------------------------------------------------------- -// C API for tensorflow::Logging. 
- -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum TF_LogLevel { - TF_INFO = 0, - TF_WARNING = 1, - TF_ERROR = 2, - TF_FATAL = 3, -} TF_LogLevel; - -TF_CAPI_EXPORT extern void TF_Log(TF_LogLevel level, const char* fmt, ...); -TF_CAPI_EXPORT extern void TF_VLog(int level, const char* fmt, ...); -TF_CAPI_EXPORT extern void TF_DVLog(int level, const char* fmt, ...); - -#ifdef __cplusplus -} -#endif - -#endif // TENSORFLOW_C_LOGGING_H_ diff --git a/tensorflow/c/tf_datatype.h b/tensorflow/c/tf_datatype.h index 02a38e9b164eb3..c991fc1f74f2e8 100644 --- a/tensorflow/c/tf_datatype.h +++ b/tensorflow/c/tf_datatype.h @@ -65,6 +65,7 @@ typedef enum TF_DataType { TF_UINT4 = 30, TF_INT2 = 31, TF_UINT2 = 32, + TF_FLOAT4_E2M1FN = 33 // 2 exponent bits, 1 mantissa bit, finite-only } TF_DataType; // TF_DataTypeSize returns the sizeof() for the underlying type corresponding diff --git a/tensorflow/cc/framework/cc_op_gen_util.cc b/tensorflow/cc/framework/cc_op_gen_util.cc index 45c88283a47a6c..048378e68f4525 100644 --- a/tensorflow/cc/framework/cc_op_gen_util.cc +++ b/tensorflow/cc/framework/cc_op_gen_util.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/cc/framework/cc_op_gen_util.h" -#include #include #include #include @@ -29,6 +28,7 @@ limitations under the License. #include "absl/log/check.h" #include "absl/log/log.h" #include "absl/status/statusor.h" +#include "absl/strings/ascii.h" #include "absl/strings/escaping.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -107,10 +107,10 @@ string ToGuard(absl::string_view path) { string guard; guard.reserve(path.size() + 1); // + 1 -> trailing _ for (const char c : path) { - if (c >= 'A' && c <= 'Z') { + if (absl::ascii_isupper(c)) { guard += c; - } else if (c >= 'a' && c <= 'z') { - guard += c + 'A' - 'a'; + } else if (absl::ascii_islower(c)) { + guard += absl::ascii_toupper(c); } else { guard += '_'; } @@ -306,7 +306,7 @@ string ToCamelCase(absl::string_view str) { } else if (c == joiner) { cap = true; } else if (cap) { - result += toupper(c); + result += absl::ascii_toupper(c); cap = false; } else { result += c; diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc index dcac1e4c0373bd..cd332ed1791849 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc @@ -42,7 +42,7 @@ namespace tensorflow { namespace cc_op { namespace { -string DefaultValue(OpDef_AttrDef attr) { +std::string DefaultValue(OpDef_AttrDef attr) { static const auto* attr_default_value_map = new absl::flat_hash_map{ @@ -80,19 +80,19 @@ string DefaultValue(OpDef_AttrDef attr) { return std::string(entry->second); } -string WriteClassFuzzDef(const OpInfo& op_info) { - string class_signature_str = absl::Substitute( +std::string WriteClassFuzzDef(const OpInfo& op_info) { + std::string class_signature_str = absl::Substitute( "class Fuzz$0 : public FuzzSession<$1> {\n", op_info.op_name, absl::StrJoin(op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { absl::StrAppend(out, "Tensor"); if (ArgIsList(arg)) absl::StrAppend(out, ", Tensor"); })); - string build_graph_body = absl::StrCat( + std::string build_graph_body = absl::StrCat( absl::StrJoin( op_info.graph_op_def.input_arg(), "", - [op_info](string* out, const OpDef_ArgDef arg) { + [op_info](std::string* out, const OpDef_ArgDef arg) { std::string type = "DT_UINT8"; if (arg.type() != DT_INVALID) { @@ 
-130,7 +130,7 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } }), absl::StrJoin(op_info.graph_op_def.attr(), "", - [op_info](string* out, const OpDef_AttrDef attr) { + [op_info](std::string* out, const OpDef_AttrDef attr) { if (op_info.inferred_input_attrs.count(attr.name()) == 0 && !attr.has_default_value()) { @@ -139,22 +139,22 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } })); - string constructor_call_str = absl::Substitute( + std::string constructor_call_str = absl::Substitute( " tensorflow::ops::$0(scope.WithOpName(\"output\")$1);\n", op_info.op_name, absl::StrCat( op_info.api_def.arg_order().empty() ? absl::StrJoin(op_info.api_def.in_arg(), "", - [](string* out, const auto api_def_arg) { + [](std::string* out, const auto api_def_arg) { strings::StrAppend(out, ", ", api_def_arg.name()); }) : absl::StrJoin(op_info.api_def.arg_order(), "", - [](string* out, const auto name) { + [](std::string* out, const auto name) { strings::StrAppend(out, ", ", name); }), absl::StrJoin(op_info.graph_op_def.attr(), "", - [op_info](string* out, const OpDef_AttrDef attr) { + [op_info](std::string* out, const OpDef_AttrDef attr) { if (op_info.inferred_input_attrs.count(attr.name()) == 0 && !attr.has_default_value()) { @@ -162,20 +162,20 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } }))); - string fuzz_impl_signature_str = absl::Substitute( + std::string fuzz_impl_signature_str = absl::Substitute( " void FuzzImpl($0) final {\n", absl::StrJoin( op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { strings::StrAppend(out, "const Tensor& ", arg.name(), "_0"); if (ArgIsList(arg)) strings::StrAppend(out, ", const Tensor& ", arg.name(), "_1"); })); - string run_inputs_str = absl::Substitute( + std::string run_inputs_str = absl::Substitute( " RunInputs({$0});\n", absl::StrJoin(op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { if (ArgIsList(arg)) { strings::StrAppend( out, "{\"", arg.name(), "\", ", arg.name(), "_0}, ", @@ -186,7 +186,7 @@ string WriteClassFuzzDef(const OpInfo& op_info) { } })); - string fuzz_class_def = strings::StrCat( + std::string fuzz_class_def = strings::StrCat( class_signature_str, " void BuildGraph(const Scope& scope) override {\n", build_graph_body, constructor_call_str, " }\n", fuzz_impl_signature_str, run_inputs_str, " }\n", "};\n"); @@ -194,24 +194,24 @@ string WriteClassFuzzDef(const OpInfo& op_info) { return fuzz_class_def; } -string WriteFuzzTest(const OpInfo& op_info) { +std::string WriteFuzzTest(const OpInfo& op_info) { return absl::Substitute( "FUZZ_TEST_F(Fuzz$0, Fuzz).WithDomains($1);\n", op_info.op_name, absl::StrJoin(op_info.graph_op_def.input_arg(), ", ", - [](string* out, const auto arg) { + [](std::string* out, const auto arg) { absl::StrAppend(out, "AnyTensor()"); if (ArgIsList(arg)) absl::StrAppend(out, ", AnyTensor()"); })); } -string FuzzerFileStart() { - const string fuzz_namespace_begin = R"namespace( +std::string FuzzerFileStart() { + const std::string fuzz_namespace_begin = R"namespace( namespace tensorflow { namespace fuzzing { )namespace"; - const string fuzz_header = + const std::string fuzz_header = absl::StrCat(R"include(// This file is MACHINE GENERATED! Do not edit. 
#include "tensorflow/cc/ops/const_op.h" @@ -224,8 +224,8 @@ namespace fuzzing { return fuzz_header; } -string FuzzerFileEnd() { - const string fuzz_footer = R"footer( +std::string FuzzerFileEnd() { + const std::string fuzz_footer = R"footer( } // namespace fuzzing } // namespace tensorflow )footer"; @@ -258,7 +258,7 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { } // TODO(unda) : zero input ops - std::set zero_input_ops = {"Placeholder", "ImmutableConst"}; + std::set zero_input_ops = {"Placeholder", "ImmutableConst"}; if (zero_input_ops.find(op_info.op_name) != zero_input_ops.end()) { std::cout << "NOT fuzzing: " << op_info.graph_op_def.name() << " takes zero inputs.\n"; @@ -266,19 +266,19 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { } // TODO(unda, 253431636): constrained kernel - std::set constrained_kernel = {"Diag", - "DiagPart", - "GatherNd", - "GatherV2", - "QuantizeAndDequantizeV2", - "QuantizeAndDequantizeV3", - "QuantizeAndDequantizeV4", - "QuantizeAndDequantizeV4Grad", - "QuantizedConcat", - "QuantizedInstanceNorm", - "QuantizedReshape", - "ScatterNd", - "TensorScatterUpdate"}; + std::set constrained_kernel = {"Diag", + "DiagPart", + "GatherNd", + "GatherV2", + "QuantizeAndDequantizeV2", + "QuantizeAndDequantizeV3", + "QuantizeAndDequantizeV4", + "QuantizeAndDequantizeV4Grad", + "QuantizedConcat", + "QuantizedInstanceNorm", + "QuantizedReshape", + "ScatterNd", + "TensorScatterUpdate"}; // TODO(unda, b/253431636): constrained kernel if (constrained_kernel.find(op_info.op_name) != constrained_kernel.end()) { @@ -297,7 +297,7 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { } } - std::set unhandled_attr_types = { + std::set unhandled_attr_types = { "list(type)", "func", "float", "bool", "tensor", "list(string)", "list(bool)", "list(shape)", "list(tensor)", "list(attr)"}; @@ -321,7 +321,7 @@ bool OpFuzzingIsOk(const OpInfo& op_info) { return true; } -string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable) { +std::string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable) { return absl::StrCat( FuzzerFileStart(), is_fuzzable ? WriteClassFuzzDef(op_info) : "", is_fuzzable ? WriteFuzzTest(op_info) : "", FuzzerFileEnd()); diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h index c11c9635d6d149..9dfee93e55e2e1 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h @@ -25,7 +25,7 @@ namespace tensorflow { namespace cc_op { // String with single fuzzer file content. 
-string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable); +std::string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable); // Do we have all we need to create a fuzzer bool OpFuzzingIsOk(const OpInfo& op_info); diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc index f4a1eb642557de..6da6e2af6c3445 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc @@ -39,8 +39,9 @@ namespace tensorflow { namespace cc_op { namespace { -void WriteAllFuzzers(string root_location, std::vector<string> api_def_dirs, - std::vector<string> op_names) { +void WriteAllFuzzers(std::string root_location, + std::vector<std::string> api_def_dirs, + std::vector<std::string> op_names) { OpList ops; absl::StatusOr<ApiDefMap> api_def_map = LoadOpsAndApiDefs(ops, false, api_def_dirs); @@ -60,7 +61,7 @@ void WriteAllFuzzers(string root_location, std::vector<string> api_def_dirs, continue; } - OpInfo op_info(op_def, *api_def, std::vector<string>()); + OpInfo op_info(op_def, *api_def, std::vector<std::string>()); status.Update(env->NewWritableFile( root_location + "/" + op_def.name() + "_fuzz.cc", &fuzz_file)); status.Update( @@ -87,9 +88,9 @@ int main(int argc, char* argv[]) { for (int i = 1; i < argc; ++i) { fprintf(stdout, "Arg %d = %s\n", i, argv[i]); } - std::vector<string> api_def_srcs = tensorflow::str_util::Split( + std::vector<std::string> api_def_srcs = tensorflow::str_util::Split( argv[2], ",", tensorflow::str_util::SkipEmpty()); - std::vector<string> op_names = tensorflow::str_util::Split( + std::vector<std::string> op_names = tensorflow::str_util::Split( argv[3], ",", tensorflow::str_util::SkipEmpty()); tensorflow::cc_op::WriteAllFuzzers(argv[1], api_def_srcs, op_names); return 0; diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index 357515a5dccb00..f3c3fd045a3d6f 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -218,9 +218,9 @@ REGISTER_GRADIENT_OP("GatherNd", GatherNdGrad); absl::Status CheckNumericsGrad(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - string message; + std::string message; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "message", &message)); - string err_msg = absl::StrCat( + std::string err_msg = absl::StrCat( "Not a number (NaN) or infinity (Inf) values detected in gradient. 
", message); grad_outputs->push_back(CheckNumerics(scope, grad_inputs[0], err_msg)); @@ -411,7 +411,7 @@ REGISTER_GRADIENT_OP("DepthToSpace", DepthToSpaceGrad); absl::Status MirrorPadGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string mode; + std::string mode; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode)); grad_outputs->push_back(tensorflow::ops::internal::MirrorPadGrad( scope, grad_inputs[0], op.input(1), mode)); @@ -424,7 +424,7 @@ REGISTER_GRADIENT_OP("MirrorPad", MirrorPadGrad); absl::Status MirrorPadGradGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string mode; + std::string mode; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode)); grad_outputs->push_back(MirrorPad(scope, grad_inputs[0], op.input(1), mode)); grad_outputs->push_back(NoGradient()); diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc index 77e2a3bfc38476..deb90eec264ee7 100644 --- a/tensorflow/cc/gradients/image_grad.cc +++ b/tensorflow/cc/gradients/image_grad.cc @@ -95,7 +95,7 @@ absl::Status ScaleAndTranslateGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - string kernel_type; + std::string kernel_type; TF_RETURN_IF_ERROR( GetNodeAttr(op.node()->attrs(), "kernel_type", &kernel_type)); bool antialias; @@ -117,7 +117,7 @@ absl::Status CropAndResizeGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { DataType input_type; - string method; + std::string method; TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "method", &method)); TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "T", &input_type)); auto image_shape = Shape(scope, op.input(0)); diff --git a/tensorflow/cc/gradients/image_grad_test.cc b/tensorflow/cc/gradients/image_grad_test.cc index f7a39f39cfc42a..b77f5512237024 100644 --- a/tensorflow/cc/gradients/image_grad_test.cc +++ b/tensorflow/cc/gradients/image_grad_test.cc @@ -203,7 +203,7 @@ class ScaleAndTranslateGradTest : public ::testing::Test { template void MakeOp(const Tensor& x_data, const Input& y_shape, Input scale, - Input translation, const string& kernel_type, bool antialias, + Input translation, const std::string& kernel_type, bool antialias, Output* x, Output* y) { *x = Const(scope_, x_data); *y = ScaleAndTranslate(scope_, *x, y_shape, scale, translation, @@ -216,7 +216,7 @@ class ScaleAndTranslateGradTest : public ::testing::Test { template void TestScaleAndTranslate(const TensorShape x_shape, const int out_height, const int out_width, Input scale, - Input translation, const string& kernel_type, + Input translation, const std::string& kernel_type, bool antialias) { Tensor x_data = MakeData(x_shape); Output x, y; diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index bf6f509c21ee8a..c785af15f95447 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -1070,8 +1070,8 @@ absl::Status MatMulGradHelper(const Scope& scope, const bool is_batch, absl::Status MatMulGradCommon(const Scope& scope, const Operation& op, const bool is_batch, const std::vector& grad_inputs, - const string& attr_adj_x, - const string& attr_adj_y, + const std::string& attr_adj_x, + const std::string& attr_adj_y, std::vector* grad_outputs) { auto a = op.input(0); auto b = op.input(1); diff --git 
a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 34c0a8fd54b4c4..6309080492c1da 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -54,7 +54,7 @@ absl::Status SoftmaxGrad(const Scope& scope, const Operation& op, REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad); bool IsZero(const Scope& scope, const Output& grad) { - string op_type_name = grad.op().node()->type_string(); + std::string op_type_name = grad.op().node()->type_string(); if (op_type_name == "ZerosLike" || op_type_name == "Zeros") { return true; } @@ -204,7 +204,7 @@ REGISTER_GRADIENT_OP("L2Loss", L2LossGrad); absl::Status BiasAddGradHelper(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - string data_format; + std::string data_format; TF_RETURN_IF_ERROR( GetNodeAttr(op.output(0).node()->attrs(), "data_format", &data_format)); auto dx_1 = @@ -218,9 +218,9 @@ REGISTER_GRADIENT_OP("BiasAdd", BiasAddGradHelper); absl::Status Conv2DGrad(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - string data_format; - string padding; - std::vector<int32> strides; + std::string data_format; + std::string padding; + std::vector<int32_t> strides; bool use_cudnn_on_gpu; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); @@ -245,10 +245,10 @@ REGISTER_GRADIENT_OP("Conv2D", Conv2DGrad); absl::Status MaxPoolGradHelper(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - string data_format; - string padding; - std::vector<int32> strides; - std::vector<int32> ksize; + std::string data_format; + std::string padding; + std::vector<int32_t> strides; + std::vector<int32_t> ksize; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); @@ -265,8 +265,8 @@ REGISTER_GRADIENT_OP("MaxPool", MaxPoolGradHelper); absl::Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - string data_format; - string padding; + std::string data_format; + std::string padding; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); @@ -283,10 +283,10 @@ REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper); absl::Status MaxPool3DGradHelper(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - std::vector<int32> ksize; - std::vector<int32> strides; - string padding; - string data_format; + std::vector<int32_t> ksize; + std::vector<int32_t> strides; + std::string padding; + std::string data_format; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); @@ -304,10 +304,10 @@ REGISTER_GRADIENT_OP("MaxPool3D", MaxPool3DGradHelper); absl::Status AvgPoolGradHelper(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { - std::vector<int32> ksize; - std::vector<int32> strides; - string padding; - string data_format; + std::vector<int32_t> ksize; + std::vector<int32_t> strides; + std::string padding; + std::string data_format; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); @@ -325,10 +325,10 @@ 
REGISTER_GRADIENT_OP("AvgPool", AvgPoolGradHelper); absl::Status AvgPool3DGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { - std::vector ksize; - std::vector strides; - string padding; - string data_format; + std::vector ksize; + std::vector strides; + std::string padding; + std::string data_format; auto attrs = op.output(0).node()->attrs(); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 56ac37e86b7168..1d23f9d87e2d7d 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -17,7 +17,9 @@ limitations under the License. #include #include +#include #include +#include #include #include "absl/log/log.h" @@ -70,7 +72,7 @@ absl::Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) { queue_runner_def.enqueue_op_name().begin(), queue_runner_def.enqueue_op_name().end()); size_t op_names_size = enqueue_op_names_.size(); - if (op_names_size > kint32max) { + if (op_names_size > std::numeric_limits::max()) { return absl::Status(absl::StatusCode::kInvalidArgument, "Enqueue ops to run cannot exceed kint32max"); } diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index ddcf94fbc07951..1722da0d390915 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -51,8 +51,8 @@ cc_library( "@local_xla//xla:status_macros", "@local_xla//xla:util", "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/backends/cpu/runtime:convolution_lib", - "@local_xla//xla/backends/cpu/runtime:dot_lib", + "@local_xla//xla/backends/cpu/runtime:convolution_dims", + "@local_xla//xla/backends/cpu/runtime:dot_dims", "@local_xla//xla/backends/cpu/runtime:thunk_proto_cc", "@local_xla//xla/service/cpu:executable_proto_cc", "@local_xla//xla/tsl/platform:statusor", @@ -96,6 +96,7 @@ cc_library( ":thunk_proto_execution_deserializer", "//tensorflow/compiler/tf2xla", "//tensorflow/compiler/tf2xla:allocator", + "//tensorflow/compiler/tf2xla:encoded_buffer_allocation_info", "//tensorflow/compiler/tf2xla:mlir_tf2xla", # fixdeps: keep "//tensorflow/compiler/tf2xla:tf2xla_proto_cc", "//tensorflow/compiler/tf2xla:tf2xla_util", diff --git a/tensorflow/compiler/aot/aot_only_var_handle_op.cc b/tensorflow/compiler/aot/aot_only_var_handle_op.cc index 86666b073b0f71..f6293e0a2063bb 100644 --- a/tensorflow/compiler/aot/aot_only_var_handle_op.cc +++ b/tensorflow/compiler/aot/aot_only_var_handle_op.cc @@ -31,7 +31,7 @@ class XlaAotOnlyVarHandleOp : public XlaOpKernel { void Compile(XlaOpKernelContext* context) override; private: - string name_; + std::string name_; }; XlaAotOnlyVarHandleOp::XlaAotOnlyVarHandleOp(OpKernelConstruction* c) diff --git a/tensorflow/compiler/aot/benchmark.cc b/tensorflow/compiler/aot/benchmark.cc index 43b9c06418c2e1..ee4af4ca65a20f 100644 --- a/tensorflow/compiler/aot/benchmark.cc +++ b/tensorflow/compiler/aot/benchmark.cc @@ -37,10 +37,10 @@ namespace benchmark { // // TODO(b/33546473): Refactor tensorflow::Env::NowMicros() so that we can re-use // the implementation without pulling in all of the Env dependencies. 
-static uint64 NowMicros() { +static uint64_t NowMicros() { struct timeval tv; gettimeofday(&tv, nullptr); - return static_cast<uint64>(tv.tv_sec) * 1000000 + tv.tv_usec; + return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; } void DumpStatsToStdout(const Stats& stats) { diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index f4969b93353e42..783dc69b6ad5c2 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" @@ -42,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/aot/embedded_protocol_buffers.h" #include "tensorflow/compiler/aot/thunk_proto_execution_deserializer.h" #include "tensorflow/compiler/tf2xla/allocator.h" +#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "xla/backends/cpu/buffer_allocation_info.h" @@ -67,41 +69,35 @@ namespace { using xla::cpu::BufferAllocationInfo; -bool IsAlpha(char c) { - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); -} - -bool IsAlphaNum(char c) { return IsAlpha(c) || (c >= '0' && c <= '9'); } - // Convert an XLA type into a C++ type. -absl::Status XLATypeToCpp(xla::PrimitiveType type, string* str) { +absl::Status XLATypeToCpp(xla::PrimitiveType type, std::string* str) { switch (type) { case xla::PRED: *str = "bool"; break; case xla::S8: - *str = "tensorflow::int8"; + *str = "int8_t"; break; case xla::S16: - *str = "tensorflow::int16"; + *str = "int16_t"; break; case xla::S32: - *str = "tensorflow::int32"; + *str = "int32_t"; break; case xla::S64: *str = "int64_t"; break; case xla::U8: - *str = "tensorflow::uint8"; + *str = "uint8_t"; break; case xla::U16: - *str = "tensorflow::uint16"; + *str = "uint16_t"; break; case xla::U32: - *str = "tensorflow::uint32"; + *str = "uint32_t"; break; case xla::U64: - *str = "tensorflow::uint64"; + *str = "uint64_t"; break; case xla::F32: *str = "float"; @@ -155,11 +151,11 @@ std::vector<BufferAllocationInfo> ExtractTempBufferAllocationInfos( // are used to generate methods for args and results. absl::Status AddRewritesForShape( int i, const xla::Shape& shape, - std::vector<std::pair<string, string>>* rewrites) { - string type; + std::vector<std::pair<std::string, std::string>>* rewrites) { + std::string type; TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type)); - std::vector<string> dim_vars; - string dim_sizes, indices; + std::vector<std::string> dim_vars; + std::string dim_sizes, indices; int count = 1; if (shape.dimensions().size() == 0 || (shape.dimensions().size() == 1 && shape.dimensions(0) == 1)) { @@ -168,8 +164,8 @@ absl::Status AddRewritesForShape( } else { for (int dim = 0; dim < shape.dimensions().size(); ++dim) { dim_vars.push_back(absl::StrCat("size_t dim", dim)); - dim_sizes += absl::StrCat("[", shape.dimensions(dim), "]"); - indices += absl::StrCat("[dim", dim, "]"); + absl::StrAppend(&dim_sizes, "[", shape.dimensions(dim), "]"); + absl::StrAppend(&indices, "[dim", dim, "]"); count *= shape.dimensions(dim); } } @@ -190,8 +186,9 @@ absl::Status AddRewritesForShape( // TODO(toddw): If this becomes a problem, we should be able to change the // algorithm to O(N) by using a state machine, e.g. regexps or a real // text-templating mechanism. 
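To make the token-rewrite scheme concrete: AddRewritesForShape fills a token table ({{TYPE}}, {{DIM_VARS}}, {{DIM_SIZES}}, {{INDICES}}, {{COUNT}}, plus the positional {{I}}), and RewriteWithName, defined next, substitutes that table and a {{NAME}} token into a code template. A hedged sketch of the effect, using a made-up mini template rather than the real generated-header text:

// Illustrative only: the substitution performed by RewriteWithName below.
std::vector<std::pair<std::string, std::string>> rewrites = {
    {"{{I}}", "0"}, {"{{TYPE}}", "float"}};
std::string code = "{{TYPE}}* arg{{NAME}}_data() { return arg_data({{I}}); }";
// RewriteWithName("_x", code, rewrites) then yields:
//   "float* arg_x_data() { return arg_data(0); }"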
-string RewriteWithName(const string& name, string code, - const std::vector<std::pair<string, string>>& rewrites) { +std::string RewriteWithName( + const std::string& name, std::string code, + const std::vector<std::pair<std::string, std::string>>& rewrites) { absl::StrReplaceAll(rewrites, &code); absl::StrReplaceAll({{"{{NAME}}", name}}, &code); return code; } @@ -201,7 +198,7 @@ string RewriteWithName(const string& name, string code, absl::Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, const CompileResult& compile_result, - string* methods) { + std::string* methods) { const int num_args = ps.parameters_size(); // feed_size() + variable_size() is the maximum number of args as an // implementation may not create an argument for an unused variable. @@ -211,11 +208,11 @@ absl::Status GenArgMethods(const tf2xla::Config& config, config.variable_size(), ") and num_args(", num_args, ")"); } for (int i = 0; i < config.feed_size(); ++i) { - std::vector<std::pair<string, string>> rewrites; + std::vector<std::pair<std::string, std::string>> rewrites; TF_ASSIGN_OR_RETURN(xla::Shape shape, xla::Shape::FromProto(ps.parameters(i))); TF_RETURN_IF_ERROR(AddRewritesForShape(i, shape, &rewrites)); - const string code = R"( + const std::string code = R"( void set_arg{{NAME}}_data(const void* data) { set_arg_data({{I}}, data); } @@ -251,7 +248,7 @@ absl::Status GenArgMethods(const tf2xla::Config& config, // Generate methods for results (outputs). absl::Status GenResultMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, - string* methods) { + std::string* methods) { if (ps.result().element_type() != xla::TUPLE) { // The XlaCompiler we use to build the xla computation always generates a // tuple result, and we rely on this to simplify code generation. @@ -270,11 +267,11 @@ absl::Status GenResultMethods(const tf2xla::Config& config, ps.result().tuple_shapes_size(), ")"); } for (int i = 0; i < config.fetch_size(); ++i) { - std::vector<std::pair<string, string>> rewrites; + std::vector<std::pair<std::string, std::string>> rewrites; TF_ASSIGN_OR_RETURN(xla::Shape shape, xla::Shape::FromProto(ps.result().tuple_shapes(i))); TF_RETURN_IF_ERROR(AddRewritesForShape(i, shape, &rewrites)); - string code = R"( + std::string code = R"( {{TYPE}}* result{{NAME}}_data() { return static_cast<{{TYPE}}*>(result_data({{I}})); } @@ -307,14 +304,14 @@ absl::Status GenResultMethods(const tf2xla::Config& config, // Generate methods for variables. absl::Status GenVariableMethods(const tf2xla::Config& config, const xla::ProgramShapeProto& ps, - string* methods) { + std::string* methods) { const int num_args = ps.parameters_size(); for (int i = config.feed_size(); i < num_args; ++i) { - std::vector<std::pair<string, string>> rewrites; + std::vector<std::pair<std::string, std::string>> rewrites; TF_ASSIGN_OR_RETURN(xla::Shape shape, xla::Shape::FromProto(ps.parameters(i))); TF_RETURN_IF_ERROR(AddRewritesForShape(i, shape, &rewrites)); - const string code = R"( + const std::string code = R"( void set_var_{{NAME}}_data({{MAYBE_CONST}}{{TYPE}}* data) { set_arg_data({{I}}, data); } @@ -348,7 +345,8 @@ absl::Status GenVariableMethods(const tf2xla::Config& config, } // Generate shape infos for args (inputs). -absl::Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, string* infos) { +absl::Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, + std::string* infos) { for (int i = 0; i < ps.parameters_size(); ++i) { const xla::ShapeProto& shape = ps.parameters(i); if (shape.element_type() == xla::TUPLE) { @@ -386,7 +384,7 @@ absl::Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, string* infos) { // Generate shape infos for results. 
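Before moving on to the shape infos, here is a concrete instance of the result-method template above: for a fetch of element type F32 at tuple index 0, the {{TYPE}}/{{NAME}}/{{I}} rewrites would expand it along these lines in the generated header (a sketch, not verbatim generator output):

// Hypothetical expansion for fetch 0 of element type F32.
float* result0_data() { return static_cast<float*>(result_data(0)); }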
absl::Status GenResultShapeInfos(const xla::ProgramShapeProto& ps, - string* infos) { + std::string* infos) { if (ps.result().element_type() != xla::TUPLE) { return absl::InternalError("codegen requires the XLA result to be a tuple"); } @@ -420,7 +418,7 @@ absl::Status GenResultShapeInfos(const xla::ProgramShapeProto& ps, // tf2xla::{Feed,Fetch,Variable}. Each feed or fetch name results in a C-style // string literal in the array, with nullptr terminating the array. template <class T> -string GenNameToIndexCode(const T& entries, bool generate) { +std::string GenNameToIndexCode(const T& entries, bool generate) { // No need for a static array if we're not supposed to generate the data. if (!generate) { return "{\n return nullptr;\n }"; } @@ -435,7 +433,7 @@ string GenNameToIndexCode(const T& entries, bool generate) { end = i; } // Emit string literals up to the last non-empty name. - string code = "{\n static const char* kNames[] = {"; + std::string code = "{\n static const char* kNames[] = {"; for (int i = 0; i < end; ++i) { if (i > 0) { code += ", "; @@ -704,13 +702,13 @@ absl::Status ExtendRewrites( if (HasThunkKind(aot_thunks->proto().thunk_sequence(), xla::cpu::ThunkProto::kSortThunk)) { runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_key_value_sort.h")"); + R"(#include "xla/backends/cpu/runtime/sort_lib.h")"); } if (HasThunkKind(aot_thunks->proto().thunk_sequence(), xla::cpu::ThunkProto::kTopKThunk)) { runtime_specific_includes.push_back( - R"(#include "xla/service/cpu/runtime_topk.h")"); + R"(#include "xla/backends/cpu/runtime/topk_lib.h")"); } TF_ASSIGN_OR_RETURN( @@ -836,18 +834,19 @@ absl::Status ExtendRewrites( absl::Status GenerateHeader( const CodegenOpts& opts, const tf2xla::Config& config, const CompileResult& compile_result, const MetadataResult& metadata_result, - const EmbeddedConstantBuffers& embedded_constant_buffers, string* header) { + const EmbeddedConstantBuffers& embedded_constant_buffers, + std::string* header) { TF_RETURN_IF_ERROR(ValidateConfig(config)); TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config)); absl::Span<const BufferAllocationInfo> buffer_infos = compile_result.aot->buffer_allocation_infos(); - const std::vector<int32> arg_index_table = + const std::vector<int32_t> arg_index_table = ::xla::cpu::CreateArgIndexTable(buffer_infos); - const std::vector<int32> result_index_table = + const std::vector<int32_t> result_index_table = ::xla::cpu::CreateResultIndexTable(buffer_infos); - std::vector<string> buffer_infos_as_strings = + std::vector<std::string> buffer_infos_as_strings = BufferAllocationInfosToCppExpression(buffer_infos); // Compute sizes and generate methods. @@ -856,11 +855,11 @@ absl::Status GenerateHeader( std::vector<BufferAllocationInfo> buffer_infos_for_temps = ExtractTempBufferAllocationInfos(buffer_infos); const xla::ProgramShapeProto& ps = compile_result.program_shape; - string methods_arg, methods_result, methods_variable; + std::string methods_arg, methods_result, methods_variable; TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); TF_RETURN_IF_ERROR(GenVariableMethods(config, ps, &methods_variable)); - string arg_shape_infos, result_shape_infos; + std::string arg_shape_infos, result_shape_infos; TF_RETURN_IF_ERROR(GenArgShapeInfos(ps, &arg_shape_infos)); TF_RETURN_IF_ERROR( CheckEqual(ps.parameters_size(), arg_index_table.size(), @@ -880,19 +879,19 @@ absl::Status GenerateHeader( const size_t temp_bytes_total = TotalBufferBytes(buffer_infos_for_temps); // Create rewrite strings for namespace start and end. 
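For example, with opts.namespaces = {"foo", "bar"}, the loops below produce the opening string "namespace foo {\nnamespace bar {\n\n" and the closing string "\n} // end namespace bar\n} // end namespace foo\n". A minimal standalone sketch of the same open-side logic, under those assumed namespace values:

#include <string>
#include <vector>
#include "absl/strings/str_cat.h"

// Mirrors the ns_start loop below; assumes namespaces = {"foo", "bar"}.
std::string NamespaceOpen(const std::vector<std::string>& namespaces) {
  std::string ns_start;
  for (const std::string& n : namespaces) {
    ns_start += absl::StrCat("namespace ", n, " {\n");
  }
  return ns_start + "\n";  // "namespace foo {\nnamespace bar {\n\n"
}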
- string ns_start; - for (const string& n : opts.namespaces) { + std::string ns_start; + for (const std::string& n : opts.namespaces) { ns_start += absl::StrCat("namespace ", n, " {\n"); } ns_start += "\n"; - string ns_end("\n"); + std::string ns_end("\n"); for (int i = opts.namespaces.size() - 1; i >= 0; --i) { - const string& n = opts.namespaces[i]; + const std::string& n = opts.namespaces[i]; ns_end += absl::StrCat("} // end namespace ", n, "\n"); } // Generate metadata. - const string arg_names_code = + const std::string arg_names_code = GenNameToIndexCode(config.feed(), opts.gen_name_to_index); auto variable_copy = config.variable(); @@ -901,12 +900,12 @@ absl::Status GenerateHeader( var.set_name(var.node_name()); } } - const string variable_names_code = + const std::string variable_names_code = GenNameToIndexCode(variable_copy, opts.gen_name_to_index); - const string result_names_code = + const std::string result_names_code = GenNameToIndexCode(config.fetch(), opts.gen_name_to_index); - const string include_xla_data_proto = + const std::string include_xla_data_proto = opts.gen_program_shape ? R"(#include "xla/xla_data.pb.h")" : ""; @@ -1155,7 +1154,7 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { } // The replacement strategy is naive, but good enough for our purposes. - std::vector<std::pair<string, string>> rewrites = { + std::vector<std::pair<std::string, std::string>> rewrites = { {"{{ARG_BYTES_ALIGNED}}", absl::StrCat(arg_bytes_aligned)}, {"{{ARG_BYTES_TOTAL}}", absl::StrCat(arg_bytes_total)}, {"{{ARG_NAMES_CODE}}", arg_names_code}, @@ -1194,10 +1193,10 @@ class {{CLASS}} final : public tensorflow::{{COMPUTATION_CLASS_BASE}} { return absl::OkStatus(); } -static string CreateUniqueIdentifier(const CodegenOpts& opts, - absl::string_view suffix) { - string result = "__tfcompile"; - for (const string& n : opts.namespaces) { +static std::string CreateUniqueIdentifier(const CodegenOpts& opts, + absl::string_view suffix) { + std::string result = "__tfcompile"; + for (const std::string& n : opts.namespaces) { absl::StrAppend(&result, "_", n); } @@ -1303,14 +1302,15 @@ absl::Status GenerateMetadata(const CodegenOpts& opts, return absl::OkStatus(); } -absl::Status ParseCppClass(const string& cpp_class, string* class_name, - std::vector<string>* namespaces) { +absl::Status ParseCppClass(const std::string& cpp_class, + std::string* class_name, + std::vector<std::string>* namespaces) { class_name->clear(); namespaces->clear(); if (cpp_class.empty()) { return errors::InvalidArgument("empty cpp_class: " + cpp_class); } - std::vector<string> parts = absl::StrSplit(cpp_class, "::"); + std::vector<std::string> parts = absl::StrSplit(cpp_class, "::"); if (parts.front().empty()) { // Allow a fully qualified name that starts with "::". parts.erase(parts.begin()); @@ -1343,11 +1343,11 @@ absl::Status ValidateCppIdent(absl::string_view ident, absl::string_view msg) { // implementation-defined characters`. We disallow those here to give // better error messages, at the expense of being more restrictive than // the standard. 
- if (ident[0] != '_' && !IsAlpha(ident[0])) { + if (ident[0] != '_' && !absl::ascii_isalpha(ident[0])) { return errors::InvalidArgument("illegal leading char: ", msg); } for (size_t pos = 1; pos < ident.size(); ++pos) { - if (ident[pos] != '_' && !IsAlphaNum(ident[pos])) { + if (ident[pos] != '_' && !absl::ascii_isalnum(ident[pos])) { return errors::InvalidArgument("illegal char: ", msg); } } diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h index 77300b0fde4e3d..ff7d96720b4eba 100644 --- a/tensorflow/compiler/aot/codegen.h +++ b/tensorflow/compiler/aot/codegen.h @@ -32,14 +32,14 @@ namespace tfcompile { // and the generated metadata object file. struct CodegenOpts { // The name of the generated C++ class, wrapping the generated function. - string class_name; + std::string class_name; // Target triple for the architecture we're targeting. - string target_triple; + std::string target_triple; // Namespaces specifies a list of C++ namespaces to add to the generated // header. If empty, all symbols will be in the global namespace. - std::vector<string> namespaces; + std::vector<std::string> namespaces; // If true, generate name-to-index data for Lookup{Arg,Result}Index methods. bool gen_name_to_index = false; @@ -62,27 +62,27 @@ struct CodegenOpts { struct MetadataResult { // These are top level "extern C" declarations that are expected to be visible // wherever program_shape_access_shim is emitted. - std::vector<string> header_variable_decls; + std::vector<std::string> header_variable_decls; // program_shape_access_shim is a C++ expression that constructs the // xla::ProgramShapeProto instance for the CompileResult passed to // GenerateMetadata. - string program_shape_access_shim; + std::string program_shape_access_shim; // hlo_profile_printer_data_access_shim is a C++ expression that constructs // the xla::HloProfilePrinterData instance for the CompileResult passed to // GenerateMetadata. If the xla::HloProfilePrinterData is null then this is a // C++ expression that evaluates to nullptr at runtime. // This is set only for AOT legacy. - string hlo_profile_printer_data_access_shim; + std::string hlo_profile_printer_data_access_shim; // cpu_executable_access_shim is a C++ expression that constructs // a protobuf required to construct a CpuExecutable. // This is set only for AOT thunks. - string cpu_executable_access_shim; + std::string cpu_executable_access_shim; // The contents of the object (".o") file. - string object_file_data; + std::string object_file_data; }; // Generates a set of constant buffers embedded into an object file. @@ -105,14 +105,16 @@ absl::Status GenerateMetadata(const CodegenOpts& opts, absl::Status GenerateHeader( const CodegenOpts& opts, const tf2xla::Config& config, const CompileResult& compile_result, const MetadataResult& metadata_result, - const EmbeddedConstantBuffers& embedded_constant_buffers, string* header); + const EmbeddedConstantBuffers& embedded_constant_buffers, + std::string* header); // ParseCppClass parses `cpp_class` into its `class_name` and `namespaces` // components. The syntax is [[<namespace>::],...]<class_name>. This // mirrors the C++ syntax for referring to a class, where multiple namespaces // may precede the class name, separated by double-colons. -absl::Status ParseCppClass(const string& cpp_class, string* class_name, - std::vector<string>* namespaces); +absl::Status ParseCppClass(const std::string& cpp_class, + std::string* class_name, + std::vector<std::string>* namespaces); // ValidateCppIdent returns OK iff ident is a valid C++ identifier. The msg is // appended to error messages. 
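A minimal usage sketch of ParseCppClass, consistent with the ParseCppClassTest expectations in the test diff below (error handling beyond the status check elided):

std::string class_name;
std::vector<std::string> namespaces;
TF_CHECK_OK(ParseCppClass("::foo::bar::MyClass", &class_name, &namespaces));
// class_name == "MyClass"; namespaces == {"foo", "bar"}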
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index a4f18482db7f32..ec0f336d87f716 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -53,7 +53,7 @@ TEST(ValidateCppIdent, Simple) {
   TF_EXPECT_OK(ValidateCppIdent("_abc", ""));
   TF_EXPECT_OK(ValidateCppIdent("_abc123", ""));
   // Make sure we didn't skip a valid letter or digit
-  string ident;
+  std::string ident;
   for (char c = 'a'; c <= 'z'; c++) {
     ident.append(1, c);
   }
@@ -78,18 +78,19 @@ TEST(ValidateCppIdent, Simple) {

 class ParseCppClassTest : public ::testing::Test {
  protected:
-  void ExpectOK(const string& cpp_class, const string& want_class_name,
-                const std::vector<string>& want_namespaces) {
-    string class_name;
-    std::vector<string> namespaces;
+  void ExpectOK(const std::string& cpp_class,
+                const std::string& want_class_name,
+                const std::vector<std::string>& want_namespaces) {
+    std::string class_name;
+    std::vector<std::string> namespaces;
     TF_EXPECT_OK(ParseCppClass(cpp_class, &class_name, &namespaces));
     EXPECT_EQ(class_name, want_class_name);
     EXPECT_EQ(namespaces, want_namespaces);
   }

-  void ExpectFail(const string& cpp_class) {
-    string class_name;
-    std::vector<string> namespaces;
+  void ExpectFail(const std::string& cpp_class) {
+    std::string class_name;
+    std::vector<std::string> namespaces;
     EXPECT_NE(ParseCppClass(cpp_class, &class_name, &namespaces),
               absl::OkStatus())
         << cpp_class;
@@ -110,7 +111,7 @@ TEST_F(ParseCppClassTest, ParseOK) {
   ExpectOK("::_foo::MyClass", "MyClass", {"_foo"});
   ExpectOK("::_foo::_MyClass", "_MyClass", {"_foo"});
   // Make sure we didn't skip a valid letter or digit
-  string ident;
+  std::string ident;
   for (char c = 'a'; c <= 'z'; c++) {
     ident.append(1, c);
   }
@@ -143,10 +144,10 @@ TEST_F(ParseCppClassTest, ParseFail) {
 }

 static void CompareWithGoldenFile(
-    const string& tensorflow_relative_golden_file_name,
-    const string& expected_contents, bool ignore_cr) {
+    const std::string& tensorflow_relative_golden_file_name,
+    const std::string& expected_contents, bool ignore_cr) {
   // Get rid of all CR characters, we may be running under windows.
-  string sanitized_expected_contents(expected_contents);
+  std::string sanitized_expected_contents(expected_contents);
   if (ignore_cr) {
     sanitized_expected_contents.erase(
         std::remove(sanitized_expected_contents.begin(),
@@ -159,7 +160,7 @@ static void CompareWithGoldenFile(
   //   blaz test --test_strategy=local \
   //     "third_party/tensorflow/compiler/aot:codegen_test"
   const bool update_golden = false;
-  string golden_file_name =
+  std::string golden_file_name =
       GetDataDependencyFilepath(tensorflow_relative_golden_file_name);

   if (update_golden) {
@@ -167,7 +168,7 @@ static void CompareWithGoldenFile(
         WriteStringToFile(Env::Default(), golden_file_name, expected_contents));
   }

-  string golden_file_contents;
+  std::string golden_file_contents;
   TF_ASSERT_OK(ReadFileToString(Env::Default(), golden_file_name,
                                 &golden_file_contents));
   if (ignore_cr) {
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 7d0897829b98ca..48c92bf346926f 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -212,7 +212,7 @@ absl::Status CompileGraph(GraphDef graph_def, const tf2xla::Config& config,
   return CompileXla(client, computation, aot_opts, compile_result);
 }

-static absl::Status ReadProtoFile(const string& fname,
+static absl::Status ReadProtoFile(const std::string& fname,
                                   protobuf::Message* proto) {
   if (absl::EndsWith(fname, ".pbtxt")) {
     return ReadTextProto(Env::Default(), fname, proto);
@@ -297,7 +297,7 @@ absl::Status Main(const MainFlags& flags) {
   TF_RETURN_IF_ERROR(ReadProtoFile(flags.config, &config));
   TF_RETURN_IF_ERROR(ValidateConfig(config));
   if (flags.dump_fetch_nodes) {
-    std::set<string> nodes;
+    std::set<std::string> nodes;
     for (const tf2xla::Fetch& fetch : config.fetch()) {
       nodes.insert(fetch.id().node_name());
     }
@@ -368,7 +368,7 @@ absl::Status Main(const MainFlags& flags) {
       GenerateMetadata(codegen_opts, compile_result, &metadata_result));
   TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_metadata_object,
                                        metadata_result.object_file_data));
-  string header;
+  std::string header;
   TF_RETURN_IF_ERROR(GenerateHeader(codegen_opts, config, compile_result,
                                     metadata_result, embedded_constant_buffers,
                                     &header));
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index 303854f40ed88c..2a0418126b8aaf 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -38,7 +38,7 @@ struct CompileResult {
   // Contains object file and meta-info.
   std::unique_ptr<xla::cpu::CpuAotCompilationResult> aot;
   xla::ProgramShapeProto program_shape;  // Static shape of args and results.
-  string entry_point;                    // Name of generated function.
+  std::string entry_point;               // Name of generated function.
   int pointer_size = 0;                  // Size of a pointer in bytes.
 };
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
index b6a6e4cfc2c8d9..1626686ba465ad 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
@@ -41,9 +41,9 @@ using xla::llvm_ir::AsStringRef;

 static void AddEmbeddedProtocolBufferToLlvmModule(
     llvm::Module* module, const ::tensorflow::protobuf::MessageLite& proto,
-    absl::string_view unique_identifier, string* protobuf_array_symbol_name,
-    int64_t* protobuf_array_size) {
-  string protobuf_array_contents = proto.SerializeAsString();
+    absl::string_view unique_identifier,
+    std::string* protobuf_array_symbol_name, int64_t* protobuf_array_size) {
+  std::string protobuf_array_contents = proto.SerializeAsString();
   *protobuf_array_symbol_name =
       absl::StrCat(unique_identifier, "_protobuf_array_contents");
   *protobuf_array_size = protobuf_array_contents.size();
@@ -58,10 +58,10 @@ static void AddEmbeddedProtocolBufferToLlvmModule(
       protobuf_array_initializer, AsStringRef(*protobuf_array_symbol_name));
 }

-static string CreateCPPShimExpression(
+static std::string CreateCPPShimExpression(
     absl::string_view qualified_cpp_protobuf_name,
     absl::string_view protobuf_array_symbol_name, int64_t protobuf_array_size) {
-  string code =
+  std::string code =
       "[]() {\n"
       "    {{PROTOBUF_NAME}}* proto = new {{PROTOBUF_NAME}};\n"
       "    proto->ParseFromArray(&{{ARRAY_SYMBOL}}[0], {{ARRAY_SIZE}});\n"
@@ -77,7 +77,7 @@ static string CreateCPPShimExpression(
       });
 }

-static absl::StatusOr<string> CodegenModule(
+static absl::StatusOr<std::string> CodegenModule(
     llvm::TargetMachine* target_machine, std::unique_ptr<llvm::Module> module) {
   llvm::SmallVector<char, 0> stream_buffer;
   llvm::raw_svector_ostream ostream(stream_buffer);
@@ -91,7 +91,7 @@ static absl::StatusOr<string> CodegenModule(

   codegen_passes.run(*module);

-  return string(stream_buffer.begin(), stream_buffer.end());
+  return std::string(stream_buffer.begin(), stream_buffer.end());
 }

 static absl::StatusOr<std::unique_ptr<llvm::TargetMachine>>
@@ -124,9 +124,9 @@ absl::StatusOr<EmbeddedProtocolBuffers> CreateEmbeddedProtocolBuffers(
   EmbeddedProtocolBuffers result;

   for (const ProtobufToEmbed& protobuf_to_embed : protobufs_to_embed) {
-    string cpp_shim, cpp_variable_decl;
+    std::string cpp_shim, cpp_variable_decl;
     if (protobuf_to_embed.message) {
-      string protobuf_array_symbol_name;
+      std::string protobuf_array_symbol_name;
       int64_t protobuf_array_size;

       AddEmbeddedProtocolBufferToLlvmModule(
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h
index 0af4d4a3362f8c..aa3553f3b6a85b 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.h
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h
@@ -37,11 +37,11 @@ struct EmbeddedProtocolBuffers {
   struct CPPShim {
     // `expression` is a C++ expression that creates an instance of said
     // protocol buffer when executed.
-    string expression;
+    std::string expression;

     // `variable_decl` is an "extern C" array declaration that is used in
     // `expression`. It must be visible wherever `expression` is emitted.
-    string variable_decl;
+    std::string variable_decl;
   };

   // Each cpp_shim corresponds to one embedded protocol buffer.
@@ -50,20 +50,20 @@ struct EmbeddedProtocolBuffers {
   // The contents of the object (".o") file the protocol buffers are embbed in.
   // This needs to be linked in to any program that wants to execute any of the
   // expressions in `cpp_shims`.
-  string object_file_data;
+  std::string object_file_data;
 };

 // Describes a protocol buffer to embed into an object file.
struct ProtobufToEmbed { // `symbol_prefix` is prefix that is guaranteed to be unique across the binary // or DSO the generated object file will be linked into. - string symbol_prefix; + std::string symbol_prefix; // `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++ // namespace qualified) protocol buffer name. This is only used in // CPPShim::expression so relatively qualified names are fine as long as // they're valid wherever CPPShim::expression is emitted. - string qualified_cpp_protobuf_name; + std::string qualified_cpp_protobuf_name; // `message` is the protocol buffer to be embedded. It is allowed to be // nullptr, in which case the generated C++ shim expression is just `nullptr`, diff --git a/tensorflow/compiler/aot/flags.h b/tensorflow/compiler/aot/flags.h index 9a3f2900dbafe4..5d0f93f7d67b88 100644 --- a/tensorflow/compiler/aot/flags.h +++ b/tensorflow/compiler/aot/flags.h @@ -27,27 +27,27 @@ namespace tfcompile { // Flags for the tfcompile binary. See *.cc file for descriptions. struct MainFlags { - string graph; - string debug_info; - string debug_info_path_begin_marker; - string config; + std::string graph; + std::string debug_info; + std::string debug_info_path_begin_marker; + std::string config; bool dump_fetch_nodes = false; - string target_triple; - string target_cpu; - string target_features; - string entry_point; - string cpp_class; - string out_function_object; - string out_metadata_object; - string out_header; - string out_constant_buffers_object; - string out_session_module; - string mlir_components; + std::string target_triple; + std::string target_cpu; + std::string target_features; + std::string entry_point; + std::string cpp_class; + std::string out_function_object; + std::string out_metadata_object; + std::string out_header; + std::string out_constant_buffers_object; + std::string out_session_module; + std::string mlir_components; bool experimental_quantize = false; // Sanitizer pass options bool sanitize_dataflow = false; - string sanitize_abilists_dataflow; + std::string sanitize_abilists_dataflow; // C++ codegen options bool gen_name_to_index = false; diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 8caeec32b7bc5e..e2509d653974e7 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -63,6 +63,7 @@ def _tfcompile_model_library_rule_impl(ctx): "--xla_cpu_fast_math_honor_functions=false " + "--xla_cpu_fast_math_honor_division=false " + "--xla_cpu_enable_fast_min_max=true " + + "--xla_cpu_experimental_ynn_fusion_type= " + additional_xla_flags + " " + "$${XLA_FLAGS:-}' "), "CUDA_VISIBLE_DEVICES": "", @@ -335,11 +336,10 @@ def _tf_library( ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually # needed. 
+ "@local_xla//xla/backends/cpu/runtime:sort_lib", + "@local_xla//xla/backends/cpu/runtime:topk_lib", "@local_xla//xla/service/cpu:runtime_conv2d", - "@local_xla//xla/service/cpu:runtime_custom_call_status", - "@local_xla//xla/service/cpu:runtime_key_value_sort", "@local_xla//xla/service/cpu:runtime_matmul", - "@local_xla//xla/service/cpu:runtime_topk", "@local_xla//xla/service/cpu:runtime_single_threaded_conv2d", "@local_xla//xla/service/cpu:runtime_single_threaded_matmul", "@eigen_archive//:eigen3", diff --git a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc index d2ced20a8d5eec..0c4edc85f99d19 100644 --- a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc +++ b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc @@ -28,8 +28,8 @@ limitations under the License. #include "absl/strings/str_join.h" #include "absl/strings/str_replace.h" #include "absl/strings/string_view.h" -#include "xla/backends/cpu/runtime/convolution_lib.h" -#include "xla/backends/cpu/runtime/dot_lib.h" +#include "xla/backends/cpu/runtime/convolution_dims.h" +#include "xla/backends/cpu/runtime/dot_dims.h" #include "xla/backends/cpu/runtime/thunk.pb.h" #include "xla/layout_util.h" #include "xla/service/cpu/executable.pb.h" @@ -594,35 +594,46 @@ ThunkProtoExecutionDeserializer::GetSortThunkRunImpl( std::vector buffers_to_sort; buffers_to_sort.reserve(sort_thunk.inputs_shapes_size()); - std::vector values_primitive_type_size_in_bytes; - values_primitive_type_size_in_bytes.reserve(sort_thunk.inputs_shapes_size()); + std::vector primitive_sizes; + primitive_sizes.reserve(sort_thunk.inputs_shapes_size()); for (const auto& buffer_proto : sort_thunk.inputs_shapes()) { buffers_to_sort.push_back( - absl::StrCat("reinterpret_cast(", + absl::StrCat("reinterpret_cast(", GetBufferAllocationString(buffer_proto.slice()), ")")); - values_primitive_type_size_in_bytes.push_back( - xla::ShapeUtil::ByteSizeOfPrimitiveType( - buffer_proto.shape().element_type())); + primitive_sizes.push_back(xla::ShapeUtil::ByteSizeOfPrimitiveType( + buffer_proto.shape().element_type())); } absl::string_view sort_thunk_invocation_format = R"( // Sort Thunk { - std::vector values = { + std::vector values = { {{BUFFERS_TO_SORT}} }; - std::vector values_primitive_type_size_in_bytes = { + std::vector primitive_sizes = { {{VALUES_PRIMITIVE_TYPE_SIZE_IN_BYTES}} }; - __xla_cpu_runtime_KeyValueSort( - {{HIGHER_DIMENSIONS}}, {{SORT_DIMENSION_ELEMENTS}}, {{LOWER_DIMENSIONS}}, - values.data(), - int32_t(values.size()), - values_primitive_type_size_in_bytes.data(), - /*is_stable=*/{{IS_STABLE}}, - reinterpret_cast(run_options), - /*prof_counters=*/nullptr, - reinterpret_cast({{SORT_FUNCTION_NAME}})); + // Type alias compatible with `FunctionLibrary::Comparator`. 
+ using Comparator = void(bool* result, const void* run_options, + const void** params, const void* buffer_table, + const void* status, const void* prof_counters); + Comparator* comparator = reinterpret_cast( + {{SORT_FUNCTION_NAME}}); + + absl::AnyInvocable less_than = + [comparator](const void** data) { + bool result; + (*comparator)(&result, nullptr, data, nullptr, nullptr, nullptr); + return result; + }; + + xla::cpu::internal::SortInplace( + { + {{HIGHER_DIMENSIONS}}, + {{SORT_DIMENSION_ELEMENTS}}, + {{LOWER_DIMENSIONS}} + }, + values, primitive_sizes, {{IS_STABLE}}, &less_than); })"; TF_ASSIGN_OR_RETURN( @@ -660,7 +671,7 @@ ThunkProtoExecutionDeserializer::GetSortThunkRunImpl( {"{{SORT_FUNCTION_NAME}}", sort_thunk.comparator_name()}, {"{{BUFFERS_TO_SORT}}", absl::StrJoin(buffers_to_sort, ", ")}, {"{{VALUES_PRIMITIVE_TYPE_SIZE_IN_BYTES}}", - absl::StrJoin(values_primitive_type_size_in_bytes, ", ")}, + absl::StrJoin(primitive_sizes, ", ")}, {"{{IS_STABLE}}", sort_thunk.is_stable() ? "true" : "false"}, }); } @@ -677,7 +688,7 @@ ThunkProtoExecutionDeserializer::GetTopKThunkRunImpl( absl::string_view topk_thunk_invocation_format = R"( // TopK Thunk { - __xla_cpu_runtime_TopKF32({{BATCH_SIZE}}, {{INPUT_SIZE}}, {{K}}, + ::xla::cpu::internal::TopK({{BATCH_SIZE}}, {{INPUT_SIZE}}, {{K}}, reinterpret_cast({{VALUES_PTR}}), reinterpret_cast({{OUTPUT_PTR}}), reinterpret_cast({{INDICES_PTR}})); diff --git a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h index 1e5e47f140020e..a5adeff3917b46 100644 --- a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h +++ b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h @@ -20,7 +20,7 @@ limitations under the License. #include #include "absl/status/statusor.h" -#include "xla/backends/cpu/runtime/convolution_lib.h" +#include "xla/backends/cpu/runtime/convolution_dims.h" #include "xla/backends/cpu/runtime/thunk.pb.h" #include "xla/debug_options_flags.h" #include "xla/service/cpu/executable.pb.h" diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index c65bb6c44b1079..7c1772c084750c 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -654,6 +654,7 @@ cc_library( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@local_xla//xla:future", "@local_xla//xla:shape_util", "@local_xla//xla:status_macros", "@local_xla//xla:util", @@ -662,7 +663,6 @@ cc_library( "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt:pjrt_common", "@local_xla//xla/pjrt:pjrt_executable", - "@local_xla//xla/pjrt:pjrt_future", "@local_xla//xla/service:executable", "@local_xla//xla/service:maybe_owning_device_memory", "@local_xla//xla/service:shaped_buffer", diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index bed899bfed2f3e..31f1aeedd9850e 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -132,7 +132,7 @@ void MergeOutgoingDataEdges(const Scope& s, Node* old_node, Node* new_node, if (merged_output.node() == nullptr) { Output new_output(new_node, oidx); if (debugging_opts.print_outputs) { - string cpu_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + std::string cpu_device = "/job:localhost/replica:0/task:0/device:CPU:0"; ops::Print print_op(s.WithOpName("print_", oidx) .WithDevice(cpu_device) .WithAssignedDevice(cpu_device), @@ -298,7 +298,8 @@ 
absl::StatusOr<Node*> ReplaceFunctionCallWithPartitionedCall(
     const GraphOptimizationPassOptions& options,
     const FunctionLibraryDefinition& flib_def, Node* n, Graph* g,
     const NameAttrList& func, const Scope& root) {
-  string config_string = options.session_options->config.SerializeAsString();
+  std::string config_string =
+      options.session_options->config.SerializeAsString();

   int input_count = absl::c_count_if(
       n->in_edges(), [](const Edge* e) { return !e->IsControlEdge(); });
@@ -346,7 +347,8 @@ absl::StatusOr<Node*> ReplaceFunctionCallWithPartitionedCall(

 absl::StatusOr<jit::DeviceId> InferDeviceForCluster(
     jit::DeviceInfoCache* device_info_cache, Node* n,
-    const string& function_name, const FunctionLibraryDefinition& flib_def) {
+    const std::string& function_name,
+    const FunctionLibraryDefinition& flib_def) {
   const FunctionDef* func_def = flib_def.Find(function_name);
   TF_RET_CHECK(func_def) << "Could not find " << function_name;

@@ -485,7 +487,8 @@ absl::Status ReplaceNodeWithXlaCompileAndXlaRun(
     requires_compilation = true;
   }

-  string device_name_str = string(device_info_cache->GetNameFor(device));
+  std::string device_name_str =
+      std::string(device_info_cache->GetNameFor(device));

   absl::Status status;
   Scope root = NewInternalScope(g, &status, /*refiner=*/nullptr)
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index c3b5ba5521ee65..6b90557df4b86f 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -85,8 +85,8 @@ absl::Status BuildXlaOps(const Scope& s, const FunctionDefLibrary& fdef_lib,
   return absl::OkStatus();
 }

-absl::Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
-                                   const string& node_name,
+absl::Status MakeXlaCompiledKernel(Graph* graph, const std::string& callee_name,
+                                   const std::string& node_name,
                                    int num_constant_args, int num_resource_args,
                                    Node** result) {
   NodeDef call_node;
@@ -99,14 +99,16 @@ absl::Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
   return absl::OkStatus();
 }

-absl::Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
-                                   const string& node_name, Node** result) {
+absl::Status MakeXlaCompiledKernel(Graph* graph, const std::string& callee_name,
+                                   const std::string& node_name,
+                                   Node** result) {
   return MakeXlaCompiledKernel(graph, callee_name, node_name,
                                /*num_constant_args=*/0, /*num_resource_args=*/0,
                                result);
 }

-Node* MakeWrite(const Scope& scope, Output value_to_write, const string& id) {
+Node* MakeWrite(const Scope& scope, Output value_to_write,
+                const std::string& id) {
   Output var_handle = ops::VarHandleOp(scope.WithOpName("Var_" + id), DT_FLOAT,
                                        TensorShape({}));
   ops::AssignVariableOp assign_op(scope.WithOpName("Assignee_" + id),
@@ -114,12 +116,13 @@ Node* MakeWrite(const Scope& scope, Output value_to_write, const string& id) {
   return assign_op.operation.node();
 }

-Node* MakeWrite(const Scope& scope, const string& id) {
+Node* MakeWrite(const Scope& scope, const std::string& id) {
   return MakeWrite(
       scope, ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f), id);
 }

-FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
+FunctionDefLibrary CreateFunctionDefLibWithConstFunction(
+    const std::string& name) {
   FunctionDefLibrary fdef_lib;
   FunctionDef func = FunctionDefHelper::Create(
       /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"},
diff --git a/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc b/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc
index bb8dce848cfbc9..4164efc65a8f4c 100644
--- a/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc
+++ b/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc
@@ -36,19 +36,21 @@ class CloneConstantsForBetterClusteringPassImpl {

  private:
   absl::Status CloneSmallConstantInputs(
-      const absl::flat_hash_set<string>& name_set, Node* n);
-  string GenerateUniqueName(const absl::flat_hash_set<string>& name_set,
-                            absl::string_view prefix);
-  absl::StatusOr<Node*> CloneNode(const absl::flat_hash_set<string>& name_set,
-                                  Node* n);
+      const absl::flat_hash_set<std::string>& name_set, Node* n);
+  std::string GenerateUniqueName(
+      const absl::flat_hash_set<std::string>& name_set,
+      absl::string_view prefix);
+  absl::StatusOr<Node*> CloneNode(
+      const absl::flat_hash_set<std::string>& name_set, Node* n);

   Graph* graph_;
   int unique_name_counter_;
 };

-string CloneConstantsForBetterClusteringPassImpl::GenerateUniqueName(
-    const absl::flat_hash_set<string>& name_set, absl::string_view prefix) {
-  string candidate;
+std::string CloneConstantsForBetterClusteringPassImpl::GenerateUniqueName(
+    const absl::flat_hash_set<std::string>& name_set,
+    absl::string_view prefix) {
+  std::string candidate;
   do {
     candidate = absl::StrCat(prefix, "/clone_", unique_name_counter_++);
   } while (name_set.contains(candidate));
@@ -56,7 +58,7 @@ string CloneConstantsForBetterClusteringPassImpl::GenerateUniqueName(
 }

 absl::StatusOr<Node*> CloneConstantsForBetterClusteringPassImpl::CloneNode(
-    const absl::flat_hash_set<string>& name_set, Node* n) {
+    const absl::flat_hash_set<std::string>& name_set, Node* n) {
   NodeDef new_in_def = n->def();
   new_in_def.clear_input();
   new_in_def.set_name(GenerateUniqueName(name_set, new_in_def.name()));
@@ -112,7 +114,7 @@ bool IsInPlaceOp(absl::string_view op_name) {

 absl::Status
 CloneConstantsForBetterClusteringPassImpl::CloneSmallConstantInputs(
-    const absl::flat_hash_set<string>& name_set, Node* n) {
+    const absl::flat_hash_set<std::string>& name_set, Node* n) {
   std::vector<const Edge*> in_edges;
   // Get the edges and sort them so we clone in a deterministic order.
   absl::c_copy(n->in_edges(), std::back_inserter(in_edges));
@@ -142,7 +144,7 @@ CloneConstantsForBetterClusteringPassImpl::CloneSmallConstantInputs(
 }

 absl::Status CloneConstantsForBetterClusteringPassImpl::Run() {
-  absl::flat_hash_set<string> name_set;
+  absl::flat_hash_set<std::string> name_set;
   absl::c_transform(graph_->nodes(), std::inserter(name_set, name_set.begin()),
                     [](Node* n) { return n->name(); });
   std::vector<Node*> nodes;
diff --git a/tensorflow/compiler/jit/cluster_scoping_pass.cc b/tensorflow/compiler/jit/cluster_scoping_pass.cc
index e70be48f0b7341..20a3d98be1d0f2 100644
--- a/tensorflow/compiler/jit/cluster_scoping_pass.cc
+++ b/tensorflow/compiler/jit/cluster_scoping_pass.cc
@@ -51,8 +51,8 @@ class ClusterScopingPassImpl {
   size_t unique_scope_id_;
 };

-std::optional<string> GetXlaInternalScope(Node* node) {
-  string scope;
+std::optional<std::string> GetXlaInternalScope(Node* node) {
+  std::string scope;
   if (GetNodeAttr(node->attrs(), kXlaInternalScopeAttr, &scope).ok()) {
     return scope;
   }
@@ -85,8 +85,8 @@ void SetXlaInternalScope(Node* node, absl::string_view scope) {
 //   Node_X (scope "stage") -> Stage
 //
 void AddOrAppendXlaInternalScope(Node* node, absl::string_view suffix) {
-  string updated_scope;
-  std::optional<string> cur_scope = GetXlaInternalScope(node);
+  std::string updated_scope;
+  std::optional<std::string> cur_scope = GetXlaInternalScope(node);
   if (cur_scope == std::nullopt) {
     updated_scope = std::string(suffix);
   } else {
@@ -96,7 +96,7 @@ void AddOrAppendXlaInternalScope(Node* node, absl::string_view suffix) {
 }

 void ClusterScopingPassImpl::AddScopeToAllTransitivePredecessors(Node* start) {
-  const string unique_suffix = absl::StrCat("_", GetUniqueScopeId());
+  const std::string unique_suffix = absl::StrCat("_", GetUniqueScopeId());

   std::vector<Node*> starts;
   starts.push_back(start);
@@ -106,7 +106,7 @@ void ClusterScopingPassImpl::AddScopeToAllTransitivePredecessors(Node* start) {
 }

 void ClusterScopingPassImpl::AddScopeToAllTransitiveSuccessors(Node* start) {
-  const string unique_suffix = absl::StrCat("_", GetUniqueScopeId());
+  const std::string unique_suffix = absl::StrCat("_", GetUniqueScopeId());

   std::vector<Node*> starts;
   starts.push_back(start);
diff --git a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc
index b09cb2c12fa297..66cc10775992a3 100644
--- a/tensorflow/compiler/jit/cluster_scoping_pass_test.cc
+++ b/tensorflow/compiler/jit/cluster_scoping_pass_test.cc
@@ -45,10 +45,11 @@ absl::Status ClusterScoping(std::unique_ptr<Graph>* graph) {
   return pass.Run(opt_options);
 }

-absl::flat_hash_map<string, string> GetXlaInternalScopes(const Graph& graph) {
-  absl::flat_hash_map<string, string> scopes;
+absl::flat_hash_map<std::string, std::string> GetXlaInternalScopes(
+    const Graph& graph) {
+  absl::flat_hash_map<std::string, std::string> scopes;
   for (Node* node : graph.nodes()) {
-    string scope;
+    std::string scope;
     if (GetNodeAttr(node->attrs(), kXlaInternalScopeAttr, &scope).ok()) {
       scopes[node->name()] = scope;
     }
@@ -63,7 +64,7 @@ absl::flat_hash_map<string, string> GetXlaInternalScopes(const Graph& graph) {
   return scopes;
 }

-Node* BuildStageNode(GraphDefBuilder& builder, string name,
+Node* BuildStageNode(GraphDefBuilder& builder, std::string name,
                      std::initializer_list<DataType> dtypes,
                      absl::Span<const ops::NodeOut> values) {
   auto opts = builder.opts()
diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc
index 50b26371698877..6c77648817f808 100644
--- a/tensorflow/compiler/jit/compilability_check_util.cc
+++ b/tensorflow/compiler/jit/compilability_check_util.cc
@@ -172,7 +172,7 @@ RecursiveCompilabilityChecker::FindUncompilableNodes(
 }
bool RecursiveCompilabilityChecker::HasXLAKernel( - const Node& node, string* uncompilable_reason) const { + const Node& node, std::string* uncompilable_reason) const { // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by // IsCompilableCall(). @@ -424,7 +424,7 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( return false; } - string uncompilable_reason; + std::string uncompilable_reason; if (IsFunctionCall(*lib_runtime->GetFunctionLibraryDefinition(), node)) { if (!IsCompilableCall(node.def(), lib_runtime, stack_trace, encapsulating_function, uncompilable_nodes)) { diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 0d86c22de11a22..7d6741529ebd08 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -262,7 +262,7 @@ class RecursiveCompilabilityChecker { } bool HasXLAKernel(const Node& node, - string* uncompilable_reason = nullptr) const; + std::string* uncompilable_reason = nullptr) const; static void MaybeMarkUncompilableNode( const absl::string_view reason, diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc index 2b2db07642d1ab..fa546e3543e358 100644 --- a/tensorflow/compiler/jit/deadness_analysis.cc +++ b/tensorflow/compiler/jit/deadness_analysis.cc @@ -123,7 +123,7 @@ class Predicate { public: enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol, kIntSymbol }; - virtual string ToString() const = 0; + virtual std::string ToString() const = 0; // An ID assigned to the Predicate at construction time. Conceptually like a // pointer, except that it is stable across runs. 
@@ -156,12 +156,12 @@ class AndPredicate : public Predicate {
   explicit AndPredicate(int64_t id, std::vector<Predicate*> operands)
       : Predicate(id), operands_(std::move(operands)) {}

-  string ToString() const override {
+  std::string ToString() const override {
     if (operands().empty()) {
       return "#true";
     }

-    std::vector<string> operands_str;
+    std::vector<std::string> operands_str;
     std::transform(operands().begin(), operands().end(),
                    std::back_inserter(operands_str),
                    [](Predicate* pred) { return pred->ToString(); });
@@ -186,12 +186,12 @@ class OrPredicate : public Predicate {
   explicit OrPredicate(int64_t id, std::vector<Predicate*> operands)
       : Predicate(id), operands_(std::move(operands)) {}

-  string ToString() const override {
+  std::string ToString() const override {
     if (operands().empty()) {
       return "#false";
     }

-    std::vector<string> operands_str;
+    std::vector<std::string> operands_str;
     std::transform(operands().begin(), operands().end(),
                    std::back_inserter(operands_str),
                    [](Predicate* pred) { return pred->ToString(); });
@@ -215,7 +215,7 @@ class NotPredicate : public Predicate {
   explicit NotPredicate(int64_t id, Predicate* operand)
       : Predicate(id), operands_({operand}) {}

-  string ToString() const override {
+  std::string ToString() const override {
     return absl::StrCat("~", operand()->ToString());
   }

@@ -251,14 +251,14 @@ class NotPredicate : public Predicate {
 class AndRecurrencePredicate : public Predicate {
  public:
   explicit AndRecurrencePredicate(int64_t id, Predicate* start, Predicate* step,
-                                  std::vector<string> frame)
+                                  std::vector<std::string> frame)
       : Predicate(id), operands_({start, step}), frame_(std::move(frame)) {}

   Predicate* start() const { return operands_[0]; }
   Predicate* step() const { return operands_[1]; }
-  absl::Span<const string> frame() const { return frame_; }
+  absl::Span<const std::string> frame() const { return frame_; }

-  string ToString() const override {
+  std::string ToString() const override {
     return absl::StrCat("{", start()->ToString(), ",&,", step()->ToString(),
                         "}<", absl::StrJoin(frame(), ";"), ">");
   }
@@ -271,7 +271,7 @@ class AndRecurrencePredicate : public Predicate {

  private:
   std::array<Predicate*, 2> operands_;
-  std::vector<string> frame_;
+  std::vector<std::string> frame_;
 };

 // Represents an uninterpreted symbol in a logical predicate.
@@ -286,7 +286,7 @@ class SymbolPredicate : public Predicate {
         tensor_id_(std::move(tensor_id)),
         must_be_true_(must_be_true) {}

-  string ToString() const override {
+  std::string ToString() const override {
     return must_be_true() ? absl::StrCat("*", tensor_id_.ToString())
                           : tensor_id_.ToString();
   }
@@ -320,7 +320,7 @@ class IntSymbolPredicate : public Predicate {
         tensor_id_(std::move(tensor_id)),
         must_have_value_(must_have_value) {}

-  string ToString() const override {
+  std::string ToString() const override {
     return must_have_value().has_value()
                ? absl::StrCat(tensor_id_.ToString(), "=", *must_have_value_)
                : tensor_id_.ToString();
   }
@@ -396,7 +396,7 @@ class PredicateFactory {
   }

   Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step,
-                                        std::vector<string> frame) {
+                                        std::vector<std::string> frame) {
     SignatureForAndRec signature(start, step, std::move(frame));
     auto it = interned_and_rec_instances_.find(signature);
     if (it != interned_and_rec_instances_.end()) {
@@ -463,8 +463,8 @@ class PredicateFactory {
       Tensor tensor(proto->dtype());
       TF_RET_CHECK(tensor.FromProto(*proto));

-      *predicate = tensor.scalar<int32>()() == *must_have_value ? MakeTrue()
-                                                                : MakeFalse();
+      *predicate = tensor.scalar<int32_t>()() == *must_have_value ? MakeTrue()
+                                                                  : MakeFalse();
       return absl::OkStatus();
     }
     SignatureForIntSymbol signature = {tensor_id, must_have_value};
@@ -559,9 +559,9 @@ class PredicateFactory {
       std::pair<Predicate::Kind, absl::Span<Predicate* const>>;
   using SignatureForNot = Predicate*;
   using SignatureForAndRec =
-      std::tuple<Predicate*, Predicate*, std::vector<string>>;
+      std::tuple<Predicate*, Predicate*, std::vector<std::string>>;
   using SignatureForSymbol = std::pair<SafeTensorId, bool>;
-  using SignatureForIntSymbol = std::pair<SafeTensorId, std::optional<int32>>;
+  using SignatureForIntSymbol = std::pair<SafeTensorId, std::optional<int32_t>>;

   struct HashSignatureForAndOr {
     size_t operator()(const SignatureForAndOr& signature) const {
@@ -586,7 +586,7 @@ class PredicateFactory {
           SafeTensorId::Hasher()(signature.first),
           Hash64Combine(
               ::tensorflow::hash<bool>()(signature.second.has_value()),
-              ::tensorflow::hash<int32>()(
+              ::tensorflow::hash<int32_t>()(
                   signature.second.has_value() ? *signature.second : 0)));
     }
   };
@@ -830,8 +830,8 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   absl::StatusOr<DeadnessAnalysis::DeadnessPredicate> GetPredicateFor(
       Node* n, int oidx) const override;
   void Print() const override;
-  absl::flat_hash_map<TensorId, string, TensorId::Hasher> PredicateMapAsString()
-      const;
+  absl::flat_hash_map<TensorId, std::string, TensorId::Hasher>
+  PredicateMapAsString() const;

  private:
   enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly };
@@ -958,7 +958,7 @@ absl::Status DeadnessAnalysisImpl::HandleSwitch(
   for (int i = 0; i < n->num_outputs() - 1; i++) {
     TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
         pred_edge->src(), pred_edge->src_output(),
-        /*must_have_value=*/std::optional<int32>(i), &branch_pred));
+        /*must_have_value=*/std::optional<int32_t>(i), &branch_pred));
     input_preds.push_back(branch_pred);
     SetPredicate(n, i, predicate_factory_.MakeAndPredicate(input_preds),
                  should_revisit);
@@ -982,7 +982,7 @@ absl::Status DeadnessAnalysisImpl::HandleSwitch(
 namespace {

 absl::Status CreateMultipleNextIterationInputsError(Node* merge) {
-  std::vector<string> backedges;
+  std::vector<std::string> backedges;
   for (const Edge* backedge : merge->in_edges()) {
     if (backedge->src()->IsNextIteration()) {
       backedges.push_back(absl::StrCat("  ", SummarizeNode(*backedge->src())));
@@ -1058,7 +1058,7 @@ Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory,

 absl::Status GetFullFrame(const Node* n,
                           absl::Span<const ControlFlowInfo> cfi_infos,
-                          std::vector<string>* frame) {
+                          std::vector<std::string>* frame) {
   int depth = 0;
   for (const ControlFlowInfo* cfi_iter = &cfi_infos[n->id()]; !n->IsSource();
        n = cfi_iter->parent_frame, cfi_iter = &cfi_infos[n->id()]) {
@@ -1174,7 +1174,7 @@ absl::Status DeadnessAnalysisImpl::HandleMerge(
     Predicate* start =
         predicate_factory_.MakeOrPredicate(non_recurrent_inputs);

-    std::vector<string> frame;
+    std::vector<std::string> frame;
     TF_RETURN_IF_ERROR(GetFullFrame(n, control_flow_info_, &frame));

     Predicate* and_rec = predicate_factory_.MakeAndRecurrencePredicate(
         start, step, std::move(frame));
@@ -1358,7 +1358,7 @@ absl::Status DeadnessAnalysisImpl::GetFrameBasedTopologicalOrder(
 // nested while, as there is no clean cut for separating them in the topological
 // order.
 absl::Status DeadnessAnalysisImpl::Populate(bool enable_optimistic) {
-  std::vector<string> unreachable_nodes;
+  std::vector<std::string> unreachable_nodes;
   // Compute the loop structure of the graph.
   TF_RETURN_IF_ERROR(
       BuildControlFlowInfo(&graph_, &control_flow_info_, &unreachable_nodes));
@@ -1582,9 +1582,9 @@ DeadnessAnalysis::~DeadnessAnalysis() {}
   return absl::OkStatus();
 }

-absl::flat_hash_map<TensorId, string, TensorId::Hasher>
+absl::flat_hash_map<TensorId, std::string, TensorId::Hasher>
 DeadnessAnalysisImpl::PredicateMapAsString() const {
-  absl::flat_hash_map<TensorId, string, TensorId::Hasher> result;
+  absl::flat_hash_map<TensorId, std::string, TensorId::Hasher> result;
   for (const auto& kv_pair : predicate_map_) {
     CHECK(result.insert({kv_pair.first, kv_pair.second->ToString()}).second);
   }
@@ -1603,7 +1603,7 @@ absl::Status ComputePredicates(const Graph& graph,
 }  // namespace deadness_analysis_internal

-string DeadnessAnalysis::DebugString(DeadnessPredicate predicate) const {
+std::string DeadnessAnalysis::DebugString(DeadnessPredicate predicate) const {
   return static_cast<Predicate*>(predicate.pred_)->ToString();
 }
diff --git a/tensorflow/compiler/jit/deadness_analysis.h b/tensorflow/compiler/jit/deadness_analysis.h
index 80fa9a20faef41..1cd394154faf36 100644
--- a/tensorflow/compiler/jit/deadness_analysis.h
+++ b/tensorflow/compiler/jit/deadness_analysis.h
@@ -81,7 +81,7 @@ class DeadnessAnalysis {
   virtual void Print() const = 0;
   virtual ~DeadnessAnalysis();

-  string DebugString(DeadnessPredicate predicate) const;
+  std::string DebugString(DeadnessPredicate predicate) const;

   // Run the deadness analysis over `graph` and returns an error or a populated
   // instance of DeadnessAnalysis in `result`.
diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h
index 0dc18d3e129d79..569cdeadae735e 100644
--- a/tensorflow/compiler/jit/deadness_analysis_internal.h
+++ b/tensorflow/compiler/jit/deadness_analysis_internal.h
@@ -24,7 +24,8 @@ namespace deadness_analysis_internal {

 // Returns a map describing the predicate each Tensor was mapped to. For
 // testing purposes only.
-using PredicateMapTy = absl::flat_hash_map<TensorId, string, TensorId::Hasher>;
+using PredicateMapTy =
+    absl::flat_hash_map<TensorId, std::string, TensorId::Hasher>;
 absl::Status ComputePredicates(const Graph& graph,
                                PredicateMapTy* out_predicate_map,
                                bool enable_optimistic = true);
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 894ee659121e25..fd7d93b3772f5f 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -61,7 +61,7 @@ absl::Status AnalyzeDeadness(Graph* graph,
   return DeadnessAnalysis::Run(*graph, result);
 }

-ops::Switch CreateSwitch(const Scope& root, const string& prefix) {
+ops::Switch CreateSwitch(const Scope& root, const std::string& prefix) {
   Output value = ops::Placeholder(root.WithOpName(prefix + "/value"), DT_FLOAT);
   Output predicate =
       ops::Placeholder(root.WithOpName(prefix + "/pred"), DT_BOOL);
@@ -76,7 +76,7 @@ void VLogGraphIfAsked(const Graph& graph) {
   if (VLOG_IS_ON(3)) {
     GraphDef graph_def;
     graph.ToGraphDef(&graph_def);
-    string serialized;
+    std::string serialized;
     ::tensorflow::protobuf::TextFormat::PrintToString(graph_def, &serialized);
     LOG(INFO) << serialized;
   }
@@ -127,8 +127,8 @@ struct InductionVarInfo {
 //                              +-----> | Exit  |
 //                                      +---------------+
 InductionVarInfo CreateInductionVariable(const Scope& root,
-                                         const string& prefix,
-                                         const string& frame_name,
+                                         const std::string& prefix,
+                                         const std::string& frame_name,
                                          const Output& initial_value) {
   Output enter_initial_value = ops::internal::Enter(
       root.WithOpName(prefix + "/enter"), initial_value, frame_name);
@@ -158,8 +158,8 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
 }

 InductionVarInfo CreateInductionVariable(const Scope& root,
-                                         const string& prefix,
-                                         const string& frame_name,
+                                         const std::string& prefix,
+                                         const std::string& frame_name,
                                          int32_t init) {
   return CreateInductionVariable(
       root, prefix, frame_name,
@@ -201,7 +201,7 @@ struct DependentInductionVar {
 };

 DependentInductionVar CreateDependentLoopInvariantValue(
-    const Scope& root, const string& prefix, const string& frame_name,
+    const Scope& root, const std::string& prefix, const std::string& frame_name,
     const Output& loop_cond, const Output& value) {
   Output enter_value = ops::internal::Enter(root.WithOpName(prefix + "/enter"),
                                             value, frame_name);
@@ -218,7 +218,7 @@ DependentInductionVar CreateDependentLoopInvariantValue(
 }

 DependentInductionVar CreateDependentLoopInvariantValue(
-    const Scope& root, const string& prefix, const string& frame_name,
+    const Scope& root, const std::string& prefix, const std::string& frame_name,
     const Output& loop_cond, int32_t value) {
   return CreateDependentLoopInvariantValue(
       root, prefix, frame_name, loop_cond,
diff --git a/tensorflow/compiler/jit/device_compilation_cluster_signature.cc b/tensorflow/compiler/jit/device_compilation_cluster_signature.cc
index 9ec02d92d37cd6..8288b44e7f1c1d 100644
--- a/tensorflow/compiler/jit/device_compilation_cluster_signature.cc
+++ b/tensorflow/compiler/jit/device_compilation_cluster_signature.cc
@@ -65,9 +65,9 @@ struct SignatureNotEqual {

 // Functor that incrementally computes a Signature's hash given its current hash
 // and one of its args.
 struct SignatureHashCombiner {
-  explicit SignatureHashCombiner(const uint64 h) : h(h) {}
-  uint64 h;
-  uint64 operator()(const Tensor& arg) {
+  explicit SignatureHashCombiner(const uint64_t h) : h(h) {}
+  uint64_t h;
+  uint64_t operator()(const Tensor& arg) {
     h = Hash64Combine(h, std::hash<int>()(static_cast<int>(arg.dtype())));
     h = Hash64Combine(
         h, Hash64(arg.tensor_data().data(), arg.tensor_data().size()));
     for (const int64_t dim : arg.shape().dim_sizes()) {
       h = Hash64Combine(h, std::hash<int64_t>()(dim));
     }
     return h;
   }
-  uint64 operator()(const TensorTypeAndShape& arg) {
+  uint64_t operator()(const TensorTypeAndShape& arg) {
     h = Hash64Combine(h, std::hash<int>()(static_cast<int>(arg.first)));
     h = Hash64Combine(h, std::hash<int>()(arg.second.size()));
     for (int dim : arg.second) {
@@ -108,8 +108,8 @@ bool Signature::operator==(const Signature& other) const {
   return true;
 }

-uint64 Signature::Hash::operator()(const Signature& signature) const {
-  uint64 h = std::hash<string>()(signature.name);
+uint64_t Signature::Hash::operator()(const Signature& signature) const {
+  uint64_t h = std::hash<std::string>()(signature.name);
   for (const auto& arg : signature.args) {
     h = std::visit(SignatureHashCombiner(h), arg);
   }
diff --git a/tensorflow/compiler/jit/device_compilation_cluster_signature.h b/tensorflow/compiler/jit/device_compilation_cluster_signature.h
index b4c2840eedee59..721c1d3b78c50e 100644
--- a/tensorflow/compiler/jit/device_compilation_cluster_signature.h
+++ b/tensorflow/compiler/jit/device_compilation_cluster_signature.h
@@ -58,7 +58,8 @@ struct DeviceCompilationClusterSignature {
   bool operator==(const DeviceCompilationClusterSignature& other) const;

   struct Hash {
-    uint64 operator()(const DeviceCompilationClusterSignature& signature) const;
+    uint64_t operator()(
+        const DeviceCompilationClusterSignature& signature) const;
   };

   // Returns a human-readable description of the signature.
diff --git a/tensorflow/compiler/jit/device_compilation_profiler.cc b/tensorflow/compiler/jit/device_compilation_profiler.cc
index 5e1b3b26e8ecb5..ec161293b7643d 100644
--- a/tensorflow/compiler/jit/device_compilation_profiler.cc
+++ b/tensorflow/compiler/jit/device_compilation_profiler.cc
@@ -107,7 +107,7 @@ absl::Status DeviceCompilationProfiler::RegisterCompilation(
           cluster_compile_stats_.emplace(function.name(), ClusterCompileStats{})
               .first;

-  const uint64 compile_time_s = compile_time_us / 1.0e6;
+  const uint64_t compile_time_s = compile_time_us / 1.0e6;
   it->second.compile_count++;
   it->second.cumulative_compile_time_us += compile_time_us;
   VLOG(1) << "Compiled " << function_name << " " << it->second.compile_count
diff --git a/tensorflow/compiler/jit/device_compiler.h b/tensorflow/compiler/jit/device_compiler.h
index 0fae07abd22897..a9f2418282c414 100644
--- a/tensorflow/compiler/jit/device_compiler.h
+++ b/tensorflow/compiler/jit/device_compiler.h
@@ -137,7 +137,7 @@ class DeviceCompiler : public ResourceBase {
     return compiler_client_.get();
   }

-  string DebugString() const override;
+  std::string DebugString() const override;

  private:
  // Common implementation of Compile and CompileSingleOp. The `OpKernelContext`
@@ -259,7 +259,7 @@ DeviceCompiler<ExecutableType, ClientType>::~DeviceCompiler() {
 }

 template <typename ExecutableType, typename ClientType>
-string DeviceCompiler<ExecutableType, ClientType>::DebugString() const {
+std::string DeviceCompiler<ExecutableType, ClientType>::DebugString() const {
   return "DeviceCompiler";
 }

@@ -331,7 +331,7 @@ DeviceCompiler<ExecutableType, ClientType>::CompileStrict(
     CompileScope scope, OpKernelContext* ctx,
     DeviceCompilationProfiler* profiler, mutex* mu) {
   tensorflow::Env* env = tensorflow::Env::Default();
-  const uint64 compile_start_us = env->NowMicros();
+  const uint64_t compile_start_us = env->NowMicros();

   TfGraphToHloCompiler compiler(options);
   cache_value.compile_state = DeviceCompileState::kCompiled;
@@ -385,8 +385,8 @@ DeviceCompiler<ExecutableType, ClientType>::CompileStrict(
   // Finalize the cache to release the XlaComputation after it was compiled.
   cache_->Finalize();

-  const uint64 compile_end_us = env->NowMicros();
-  const uint64 compile_time_us = compile_end_us - compile_start_us;
+  const uint64_t compile_end_us = env->NowMicros();
+  const uint64_t compile_time_us = compile_end_us - compile_start_us;

   device_compiler_internal::LogOnceXlaCompiledFirstCluster();
   TF_RETURN_IF_ERROR(profiler->RegisterCompilation(
@@ -496,7 +496,7 @@ absl::Status DeviceCompiler<ExecutableType, ClientType>::CompileImpl(

   profiler->RegisterExecution(function);

-  string human_signature;
+  std::string human_signature;
   if (VLOG_IS_ON(2)) {
     human_signature = VLOG_IS_ON(3) ? signature.HumanString() : function.name();
     VLOG(2) << "DeviceCompilationClusterSignature: " << human_signature;
diff --git a/tensorflow/compiler/jit/device_compiler_test.cc b/tensorflow/compiler/jit/device_compiler_test.cc
index 64e286bff55b07..749110be186311 100644
--- a/tensorflow/compiler/jit/device_compiler_test.cc
+++ b/tensorflow/compiler/jit/device_compiler_test.cc
@@ -139,7 +139,7 @@ class MockXlaDeviceExecutablePersistor
             Config{testing::TmpDir(), false, "xla"},
             DeviceType(DEVICE_CPU_XLA_JIT)) {}
   MOCK_METHOD(absl::Status, TryToPersistExecutable,
-              (uint64, const std::string&, const XlaCompiler::Options&,
+              (uint64_t, const std::string&, const XlaCompiler::Options&,
                const XlaCompiler::CompilationResult&,
                const xla::LocalExecutable&,
                (DeviceCompilerClient<xla::LocalExecutable, xla::LocalClient>*)),
              (override));
@@ -425,7 +425,7 @@ TEST_F(DeviceCompilerTest, CompileFailedToLoadFromPersistentCache) {
                              &xla_executable));

   // Corrupt the file which contains the serialized executable.
-  std::vector<string> files;
+  std::vector<std::string> files;
   TF_ASSERT_OK(Env::Default()->GetChildren(testing::TmpDir(), &files));
   std::string const* serialized_executable_filename = nullptr;
   for (const auto& file : files) {
diff --git a/tensorflow/compiler/jit/device_context_test.cc b/tensorflow/compiler/jit/device_context_test.cc
index 34a0c3d5ea067b..33bba30f3db3e1 100644
--- a/tensorflow/compiler/jit/device_context_test.cc
+++ b/tensorflow/compiler/jit/device_context_test.cc
@@ -38,7 +38,7 @@ static bool Initialized = [] {

 class DeviceContextTest : public ::testing::Test {
  public:
-  void SetDevice(const string& device_type) {
+  void SetDevice(const std::string& device_type) {
     auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api;
     rollout_config.AllowForDeviceInXlaLaunch(DeviceType(device_type));
     rollout_config.AllowForDeviceInXlaCompileOnDemand(DeviceType(device_type));
diff --git a/tensorflow/compiler/jit/device_executable_persistor.h b/tensorflow/compiler/jit/device_executable_persistor.h
index 458441c86b5c43..5a64b078e1a93c 100644
--- a/tensorflow/compiler/jit/device_executable_persistor.h
+++ b/tensorflow/compiler/jit/device_executable_persistor.h
@@ -96,7 +96,7 @@ class DeviceExecutablePersistor {
   // TODO(b/255826209): Take in Signature instead of hash and string once cache
   // is refactored.
   std::optional<absl::StatusOr<std::unique_ptr<ExecutableType>>> TryToLoadExecutable(
-      uint64 signature_hash, const std::string& signature_str,
+      uint64_t signature_hash, const std::string& signature_str,
       const XlaCompiler::Options& options,
       const XlaCompiler::CompilationResult& compilation_result,
       DeviceCompilerClient<ExecutableType, ClientType>* client) const;
@@ -107,7 +107,7 @@ class DeviceExecutablePersistor {
   // TODO(b/255826209): Take in Signature instead hash and string once cache
   // is refactored.
   virtual absl::Status TryToPersistExecutable(
-      uint64 signature_hash, const std::string& signature_str,
+      uint64_t signature_hash, const std::string& signature_str,
       const XlaCompiler::Options& options,
       const XlaCompiler::CompilationResult& compilation_result,
       const ExecutableType& executable,
@@ -123,15 +123,15 @@ class DeviceExecutablePersistor {
   // Returns a cache key proto that identifies an entry in the compilation
   // cache.
   XlaSerializedCacheKey BuildSerializedCacheKey(
-      uint64 signature_hash, const xla::HloModuleProto& hlo_module) const;
+      uint64_t signature_hash, const xla::HloModuleProto& hlo_module) const;

   XlaSerializedCacheKey BuildSerializedCacheKey(
-      uint64 signature_hash, const xla::HloModuleProto& hlo_module,
+      uint64_t signature_hash, const xla::HloModuleProto& hlo_module,
       bool compiled_using_pjrt) const;

   // Serializes the signature and its corresponding entry to a proto message.
   absl::StatusOr<XlaSerializedCacheEntry> SerializeEntry(
-      uint64 signature_hash, const XlaCompiler::Options& options,
+      uint64_t signature_hash, const XlaCompiler::Options& options,
       const XlaCompiler::CompilationResult& compilation_result,
       const ExecutableType& executable,
       DeviceCompilerClient<ExecutableType, ClientType>* compiler_client) const;
@@ -189,7 +189,7 @@ std::string DeviceExecutablePersistor<ExecutableType, ClientType>::GetFilePath(

 template <typename ExecutableType, typename ClientType>
 XlaSerializedCacheKey
 DeviceExecutablePersistor<ExecutableType, ClientType>::BuildSerializedCacheKey(
-    uint64 signature_hash, const xla::HloModuleProto& hlo_module,
+    uint64_t signature_hash, const xla::HloModuleProto& hlo_module,
     bool compiled_using_pjrt) const {
   XlaSerializedCacheKey key;
   key.set_signature_fingerprint(signature_hash);
@@ -203,7 +203,7 @@ DeviceExecutablePersistor<ExecutableType, ClientType>::BuildSerializedCacheKey(

 template <typename ExecutableType, typename ClientType>
 XlaSerializedCacheKey
 DeviceExecutablePersistor<ExecutableType, ClientType>::BuildSerializedCacheKey(
-    uint64 signature_hash, const xla::HloModuleProto& hlo_module) const {
+    uint64_t signature_hash, const xla::HloModuleProto& hlo_module) const {
   return BuildSerializedCacheKey(signature_hash, hlo_module, false);
 }

@@ -212,7 +212,7 @@ DeviceExecutablePersistor<ExecutableType, ClientType>::BuildSerializedCacheKey(
 template <>
 inline XlaSerializedCacheKey
 DeviceExecutablePersistor<xla::PjRtLoadedExecutable, xla::PjRtClient>::
-    BuildSerializedCacheKey(uint64 signature_hash,
+    BuildSerializedCacheKey(uint64_t signature_hash,
                             const xla::HloModuleProto& hlo_module) const {
   return BuildSerializedCacheKey(signature_hash, hlo_module, true);
 }

@@ -305,7 +305,7 @@ DeviceExecutablePersistor<ExecutableType, ClientType>::SaveSerializedEntry(

 template <typename ExecutableType, typename ClientType>
 absl::StatusOr<XlaSerializedCacheEntry>
 DeviceExecutablePersistor<ExecutableType, ClientType>::SerializeEntry(
-    uint64 signature_hash, const XlaCompiler::Options& options,
+    uint64_t signature_hash, const XlaCompiler::Options& options,
     const XlaCompiler::CompilationResult& compilation_result,
     const ExecutableType& executable,
     DeviceCompilerClient<ExecutableType, ClientType>* compiler_client) const {
@@ -340,7 +340,7 @@ DeviceExecutablePersistor<ExecutableType, ClientType>::SerializeEntry(

 template <typename ExecutableType, typename ClientType>
 std::optional<absl::StatusOr<std::unique_ptr<ExecutableType>>>
 DeviceExecutablePersistor<ExecutableType, ClientType>::TryToLoadExecutable(
-    uint64 signature_hash, const std::string& signature_str,
+    uint64_t signature_hash, const std::string& signature_str,
     const XlaCompiler::Options& options,
     const XlaCompiler::CompilationResult& compilation_result,
     DeviceCompilerClient<ExecutableType, ClientType>* compiler_client) const {
@@ -376,7 +376,7 @@ DeviceExecutablePersistor<ExecutableType, ClientType>::TryToLoadExecutable(

 template <typename ExecutableType, typename ClientType>
 absl::Status
 DeviceExecutablePersistor<ExecutableType, ClientType>::TryToPersistExecutable(
-    uint64 signature_hash, const std::string& signature_str,
+    uint64_t signature_hash, const std::string& signature_str,
     const XlaCompiler::Options& options,
     const XlaCompiler::CompilationResult& compilation_result,
     const ExecutableType& executable,
diff --git a/tensorflow/compiler/jit/device_executable_persistor_test.cc b/tensorflow/compiler/jit/device_executable_persistor_test.cc
index 7779f1112e7b9e..62cfd4c1b8e0b7 100644
--- a/tensorflow/compiler/jit/device_executable_persistor_test.cc
+++ b/tensorflow/compiler/jit/device_executable_persistor_test.cc
@@ -222,7 +222,7 @@ absl::StatusOr<XlaSerializedCacheEntry> ReadCacheEntryFromFile(
 }

 XlaSerializedCacheKey CreateCacheKey(
-    uint64 signature_hash,
+    uint64_t signature_hash,
     const XlaCompiler::CompilationResult& compilation_result,
     const DeviceType& device_type, const std::string& persistence_prefix,
     bool compiled_using_pjrt = false) {
diff --git a/tensorflow/compiler/jit/device_util.cc b/tensorflow/compiler/jit/device_util.cc
index 828da0b08c2590..1979aec5bcf0c3 100644
--- a/tensorflow/compiler/jit/device_util.cc
+++ b/tensorflow/compiler/jit/device_util.cc
@@ -44,7 +44,7 @@ void DeviceSet::UnionWith(const DeviceSet& other) {
 }

 bool DeviceSet::IsEmpty() const {
const { - return absl::c_all_of(storage_, [&](uint64 val) { return val == 0; }); + return absl::c_all_of(storage_, [&](uint64_t val) { return val == 0; }); } absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { @@ -56,7 +56,7 @@ absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { } int new_id = names_.size(); - names_.push_back(string(name)); + names_.push_back(std::string(name)); id_to_device_type_.push_back(std::make_unique("")); DeviceType* device_type = id_to_device_type_.back().get(); TF_RETURN_IF_ERROR(DeviceNameToDeviceType(names_.back(), device_type)); @@ -64,7 +64,7 @@ absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { is_cpu_.push_back(device_type->type_string() == DEVICE_CPU); is_gpu_.push_back(device_type->type_string() == DEVICE_GPU); - name_to_id_.emplace(string(name), DeviceId(new_id)); + name_to_id_.emplace(std::string(name), DeviceId(new_id)); const XlaOpRegistry::DeviceRegistration* compilation_device; if (!XlaOpRegistry::GetCompilationDevice(device_type->type(), @@ -76,10 +76,10 @@ absl::StatusOr DeviceInfoCache::GetIdFor(absl::string_view name) { return DeviceId(new_id); } -string DeviceInfoCache::DebugString(const DeviceSet& device_set) const { - std::vector names; +std::string DeviceInfoCache::DebugString(const DeviceSet& device_set) const { + std::vector names; device_set.ForEach([&](DeviceId device_id) { - names.push_back(string(GetNameFor(device_id))); + names.push_back(std::string(GetNameFor(device_id))); return true; }); @@ -87,7 +87,7 @@ string DeviceInfoCache::DebugString(const DeviceSet& device_set) const { } } // namespace jit -absl::Status DeviceNameToDeviceType(const string& device, +absl::Status DeviceNameToDeviceType(const std::string& device, DeviceType* device_type) { DeviceNameUtils::ParsedName parsed; if (!DeviceNameUtils::ParseFullName(device, &parsed)) { diff --git a/tensorflow/compiler/jit/device_util.h b/tensorflow/compiler/jit/device_util.h index 745f87309501d8..fa862aac88c394 100644 --- a/tensorflow/compiler/jit/device_util.h +++ b/tensorflow/compiler/jit/device_util.h @@ -75,9 +75,9 @@ class DeviceSet { // iterator if this ends up being used widely. for (int word_index = 0, end = storage_.size(); word_index < end; word_index++) { - uint64 word = storage_[word_index]; + uint64_t word = storage_[word_index]; while (word != 0) { - uint64 only_lowest_bit_set = word & -word; + uint64_t only_lowest_bit_set = word & -word; // The number of trailing zeros in a non-zero word is the index of the // least significant 1. int bit_index = absl::countr_zero(word); @@ -90,7 +90,7 @@ class DeviceSet { } private: - absl::InlinedVector storage_; + absl::InlinedVector storage_; const int kWordSize = 64; }; @@ -131,17 +131,17 @@ class DeviceInfoCache { return std::cref(*id_to_device_type_[device_id.id()]); } - string DebugString(const DeviceSet& device_set) const; + std::string DebugString(const DeviceSet& device_set) const; private: - absl::flat_hash_map name_to_id_; + absl::flat_hash_map name_to_id_; // These fields are populated for a device in GetIdFor, *before* we give out a // DeviceId. std::vector id_to_compilation_device_; std::vector> id_to_device_type_; - std::vector names_; + std::vector names_; std::vector is_cpu_; std::vector is_gpu_; }; @@ -149,7 +149,7 @@ class DeviceInfoCache { } // namespace jit // Returns the DeviceType corresponding to 'device'. 
-absl::Status DeviceNameToDeviceType(const string& device, +absl::Status DeviceNameToDeviceType(const std::string& device, DeviceType* device_type); // Picks the device for which XLA should compile a cluster that contains diff --git a/tensorflow/compiler/jit/device_util_test.cc b/tensorflow/compiler/jit/device_util_test.cc index cef39df6283f2b..be58292f931686 100644 --- a/tensorflow/compiler/jit/device_util_test.cc +++ b/tensorflow/compiler/jit/device_util_test.cc @@ -23,7 +23,7 @@ namespace { absl::Status PickDeviceHelper(bool allow_mixing_unknown_and_cpu, absl::Span device_names, - string* result) { + std::string* result) { jit::DeviceInfoCache cache; jit::DeviceSet device_set; for (absl::string_view name : device_names) { @@ -34,14 +34,14 @@ absl::Status PickDeviceHelper(bool allow_mixing_unknown_and_cpu, TF_ASSIGN_OR_RETURN( jit::DeviceId result_id, PickDeviceForXla(cache, device_set, allow_mixing_unknown_and_cpu)); - *result = string(cache.GetNameFor(result_id)); + *result = std::string(cache.GetNameFor(result_id)); return absl::OkStatus(); } void CheckPickDeviceResult(absl::string_view expected_result, bool allow_mixing_unknown_and_cpu, absl::Span inputs) { - string result; + std::string result; TF_ASSERT_OK(PickDeviceHelper(allow_mixing_unknown_and_cpu, inputs, &result)) << "inputs = [" << absl::StrJoin(inputs, ", ") << "], allow_mixing_unknown_and_cpu=" << allow_mixing_unknown_and_cpu @@ -51,7 +51,7 @@ void CheckPickDeviceResult(absl::string_view expected_result, void CheckPickDeviceHasError(bool allow_mixing_unknown_and_cpu, absl::Span inputs) { - string result; + std::string result; EXPECT_FALSE( PickDeviceHelper(allow_mixing_unknown_and_cpu, inputs, &result).ok()); } @@ -110,10 +110,10 @@ void SimpleRoundTripTestForDeviceSet(int num_devices) { jit::DeviceSet device_set; jit::DeviceInfoCache device_info_cache; - std::vector expected_devices, actual_devices; + std::vector expected_devices, actual_devices; for (int i = 0; i < num_devices; i++) { - string device_name = + std::string device_name = absl::StrCat("/job:localhost/replica:0/task:0/device:XPU:", i); TF_ASSERT_OK_AND_ASSIGN(jit::DeviceId device_id, device_info_cache.GetIdFor(device_name)); @@ -122,7 +122,8 @@ void SimpleRoundTripTestForDeviceSet(int num_devices) { } device_set.ForEach([&](jit::DeviceId device_id) { - actual_devices.push_back(string(device_info_cache.GetNameFor(device_id))); + actual_devices.push_back( + std::string(device_info_cache.GetNameFor(device_id))); return true; }); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 3e8a43ce08ed58..6e7d16de16a4f6 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -115,7 +115,7 @@ void MarkGuaranteedConstants( } struct OutputInputTensorPairHasher { - uint64 operator()(std::pair const& s) const { + uint64_t operator()(std::pair const& s) const { return Hash64Combine(OutputTensor::Hash()(s.first), InputTensor::Hash()(s.second)); } @@ -128,7 +128,7 @@ static const char* const kRetValOp = "_Retval"; class Encapsulator { public: - Encapsulator(string group_attribute, Graph const* graph_in) + Encapsulator(std::string group_attribute, Graph const* graph_in) : group_attribute_(std::move(group_attribute)), graph_in_(graph_in) {} // Find subgraphs marked with 'group_attribute', and build a new @@ -182,7 +182,7 @@ class Encapsulator { // 'reuse_existing_functions' is set, use an existing function with the same // name, if 
any. If 'rewrite_subgraph_fn' is set, it is applied to the // subgraph before function conversion. - absl::Status BuildFunctionDef(const string& name_in, + absl::Status BuildFunctionDef(const std::string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library); @@ -226,7 +226,7 @@ class Encapsulator { const absl::flat_hash_map& node_images); // Creates the sequencer node if it doesn't exist, adding it to graph_out. - absl::Status MakeSequencingNode(const string& subgraph_name, + absl::Status MakeSequencingNode(const std::string& subgraph_name, Graph* graph_out); // If there is a sequencer node, adds a control edge from the sequencer to @@ -243,14 +243,14 @@ class Encapsulator { // Which device are these nodes on? Used to assign a device to the call // node. - string device_; + std::string device_; // NodeDef for the function call node. NodeDef call_node_def_; // Name that is used for the call node. This may not be // call_node_def_.name() if the client supplies a rewrite lambda. - string function_def_name_; + std::string function_def_name_; // Placeholder node simulating the host compute key in the output graph. // Not owned. @@ -275,7 +275,7 @@ class Encapsulator { // Set of node names that are the source of a control output of the // subgraph. We store strings here so that we can tolerate nodes being // removed from the graph. - absl::flat_hash_set control_output_nodes_; + absl::flat_hash_set control_output_nodes_; // NoOp node in the output graph that is sequenced after the call node. Node* sequencer_ = nullptr; @@ -283,7 +283,7 @@ class Encapsulator { // Returns the key attribute associated with a node in attr. Sets either // result to the empty string if the respective attribute is not found. - absl::Status GetFunctionNameAttr(Node const* node, string* attr) const; + absl::Status GetFunctionNameAttr(Node const* node, std::string* attr) const; // Copies edges local to a subgraph. Adds _Arg and _Retval nodes to // subgraphs for data edges that cross subgraph boundaries. @@ -308,36 +308,35 @@ class Encapsulator { // a subgraph boundary it is the output of a call node, otherwise it is a node // in the output graph. absl::Status FindOutputImageOfEdgeSrc( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_src_node, Node** src_image); // Finds an edge source slot in the output graph. If the edge crosses a // subgraph boundary it is a slot on the output of a call node, otherwise it // is a slot on a node in the output graph. - int FindOutputSlotOfEdgeSrc(const string& src_func_id, - const string& dst_func_id, - const Edge* edge); + int FindOutputSlotOfEdgeSrc(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge); // Finds the image of an edge destination in the output graph. If the edge // crosses a subgraph boundary it is the input of a call node, otherwise it is // a node in the output graph. absl::Status FindOutputImageOfEdgeDst( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_dst_node, Node** dst_image); // Finds an edge destination slot in the output graph. If the edge crosses a // subgraph boundary it is a slot on the input of a call node, otherwise it is // a slot on a node in the output graph. 
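Returning briefly to the DeviceInfoCache::GetIdFor hunk earlier: beyond the string migration, the pattern it implements is name interning, where each distinct device name gets a dense integer id so later passes (like the DeviceSet bitset) can avoid hash lookups. A minimal hypothetical sketch of that shape, not TF's actual class:

#include <string>
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"

class NameInterner {
 public:
  int GetIdFor(absl::string_view name) {
    auto it = name_to_id_.find(name);  // heterogeneous string_view lookup
    if (it != name_to_id_.end()) return it->second;
    int new_id = names_.size();
    names_.push_back(std::string(name));  // copy only on first sight
    name_to_id_.emplace(names_.back(), new_id);
    return new_id;
  }
  absl::string_view GetNameFor(int id) const { return names_[id]; }

 private:
  absl::flat_hash_map<std::string, int> name_to_id_;
  std::vector<std::string> names_;
};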
- int FindOutputSlotOfEdgeDst(const string& src_func_id, - const string& dst_func_id, - const Edge* edge); + int FindOutputSlotOfEdgeDst(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge); // Copies a single edge to the output graph. The edge is either entirely // within the output graph, or crosses into or out of a compiled subgraph. absl::Status CopyEdgeToOutputGraph( - const Edge* edge, const string& src_func_id, const string& dst_func_id, + const Edge* edge, const std::string& src_func_id, + const std::string& dst_func_id, const absl::flat_hash_map& node_images, Graph* graph_out, absl::flat_hash_set, @@ -358,10 +357,10 @@ class Encapsulator { absl::flat_hash_map* node_images, FunctionLibraryDefinition* library); - const string group_attribute_; + const std::string group_attribute_; const Graph* graph_in_; - absl::flat_hash_map subgraphs_; + absl::flat_hash_map subgraphs_; Encapsulator(const Encapsulator&) = delete; void operator=(const Encapsulator&) = delete; @@ -374,19 +373,20 @@ namespace { // including clusters that are not present in the ancestors map. has_successors // is the set of clusters that are ancestors of some other cluster. void TopologicalClusterSort( - const absl::flat_hash_set& clusters, - const absl::flat_hash_set& has_successors, - const absl::flat_hash_map>& ancestors, - std::vector* sorted) { + const absl::flat_hash_set& clusters, + const absl::flat_hash_set& has_successors, + const absl::flat_hash_map>& + ancestors, + std::vector* sorted) { // The nodes are placed in 'sorted' in topological order. sorted->clear(); // We don't use the standard DFS because we are not operating on Node* // objects. struct Work { - string cluster; + std::string cluster; bool leave; }; - std::set visited; + std::set visited; std::vector stack; // Seed the processing list with clusters that have no successors. for (const auto& cluster : clusters) { @@ -523,7 +523,7 @@ absl::Status Encapsulator::Subgraph::RecordResult( } absl::Status Encapsulator::Subgraph::MakeSequencingNode( - const string& subgraph_name, Graph* graph_out) { + const std::string& subgraph_name, Graph* graph_out) { if (sequencer_ == nullptr) { NodeDef seq_def; // TODO(shikharagarwal): What source node should we use for errors? @@ -547,11 +547,11 @@ void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) { } absl::Status Encapsulator::Subgraph::BuildFunctionDef( - const string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn, + const std::string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library) { // name_in is copied here because name may be modified below if // rewrite_subgraph_fn is true. 
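TopologicalClusterSort in the hunk above avoids recursive DFS because it operates on cluster names rather than Node pointers; the Work{cluster, leave} record drives an explicit stack, and a cluster is emitted only on its "leave" visit, after all of its ancestors. A self-contained sketch of that traversal shape, inferred from the visible declarations rather than copied from the TF implementation:

#include <map>
#include <set>
#include <string>
#include <vector>

// Emits ancestors before descendants, seeded from the given roots.
void TopoSort(const std::map<std::string, std::vector<std::string>>& ancestors,
              const std::vector<std::string>& roots,
              std::vector<std::string>* sorted) {
  struct Work {
    std::string node;
    bool leave;  // false: entering the node; true: all ancestors are done
  };
  std::set<std::string> visited;
  std::vector<Work> stack;
  for (const auto& r : roots) stack.push_back({r, /*leave=*/false});
  while (!stack.empty()) {
    Work w = stack.back();
    stack.pop_back();
    if (w.leave) {
      sorted->push_back(w.node);  // all ancestors already emitted
      continue;
    }
    if (!visited.insert(w.node).second) continue;  // already processed
    stack.push_back({w.node, /*leave=*/true});
    auto it = ancestors.find(w.node);
    if (it != ancestors.end())
      for (const auto& a : it->second) stack.push_back({a, /*leave=*/false});
  }
}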
- string name = name_in; + std::string name = name_in; call_node_def_.set_op(name); call_node_def_.set_name(name); call_node_def_.set_device(device_); @@ -596,7 +596,7 @@ absl::Status Encapsulator::Subgraph::BuildFunctionDef( function_def_name_ = name; FunctionDef fdef; - auto lookup = [this](const Node* node) -> std::optional { + auto lookup = [this](const Node* node) -> std::optional { if (control_output_nodes_.contains(node->name())) { return std::make_optional(node->name()); } @@ -625,7 +625,7 @@ absl::Status Encapsulator::Subgraph::BuildFunctionDef( absl::Status Encapsulator::Subgraph::ReplaceFunctionDef( FunctionLibraryDefinition* library) { - const string& name = function_def_name_; + const std::string& name = function_def_name_; FunctionDef fdef; TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef)); @@ -654,7 +654,7 @@ absl::Status Encapsulator::Subgraph::AddFunctionCallNode( } absl::Status Encapsulator::GetFunctionNameAttr(Node const* node, - string* attr) const { + std::string* attr) const { AttrSlice attrs = node->attrs(); attr->clear(); for (const auto& node_attr : attrs) { @@ -667,12 +667,12 @@ absl::Status Encapsulator::GetFunctionNameAttr(Node const* node, return absl::OkStatus(); } -bool IsInSubgraph(const string& func_id) { return !func_id.empty(); } +bool IsInSubgraph(const std::string& func_id) { return !func_id.empty(); } absl::Status Encapsulator::CopySubgraphNodes( absl::flat_hash_map* node_images) { for (Node* node : graph_in_->op_nodes()) { - string func_id; + std::string func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); if (!IsInSubgraph(func_id)) continue; @@ -688,9 +688,9 @@ absl::Status Encapsulator::CopySubgraphEdges( const absl::flat_hash_map& node_images, std::vector>* src_arg_pairs) { for (const Edge* edge : graph_in_->edges()) { - string src_func_id; + std::string src_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id)); - string dst_func_id; + std::string dst_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id)); Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr); Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr); @@ -793,7 +793,7 @@ absl::Status Encapsulator::BuildFunctionDefs( const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, FunctionLibraryDefinition* library) { for (auto& subgraph_entry : subgraphs_) { - string name = subgraph_entry.first; + std::string name = subgraph_entry.first; Subgraph& subgraph = subgraph_entry.second; TF_RETURN_IF_ERROR(subgraph.BuildFunctionDef( name, rewrite_subgraph_fn, reuse_existing_functions, library)); @@ -804,7 +804,7 @@ absl::Status Encapsulator::BuildFunctionDefs( absl::Status Encapsulator::CopyNodesToOutputGraph( Graph* graph_out, absl::flat_hash_map* node_images) { for (Node* node : graph_in_->op_nodes()) { - string func_id; + std::string func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); // Don't copy nodes that are going to be encapsulated. 
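The lookup lambda passed to graph-to-function conversion in BuildFunctionDef above uses std::optional as its contract: a name means "this node is a control output", std::nullopt means it is not. A tiny standalone illustration of that callback pattern, with hypothetical node names:

#include <iostream>
#include <optional>
#include <set>
#include <string>

int main() {
  std::set<std::string> control_output_nodes = {"send_x", "assert_y"};
  auto lookup =
      [&](const std::string& node_name) -> std::optional<std::string> {
    if (control_output_nodes.count(node_name) > 0)
      return std::make_optional(node_name);
    return std::nullopt;  // not a control output
  };
  for (std::string n : {"send_x", "mul"})
    std::cout << n << " -> " << lookup(n).value_or("<not a control ret>")
              << "\n";
}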
@@ -829,7 +829,7 @@ absl::Status Encapsulator::AddFunctionCallNodes( } absl::Status Encapsulator::FindOutputImageOfEdgeSrc( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_src_node, Node** src_image) { if (IsInSubgraph(src_func_id)) { @@ -844,8 +844,8 @@ absl::Status Encapsulator::FindOutputImageOfEdgeSrc( return absl::OkStatus(); } -int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id, - const string& dst_func_id, +int Encapsulator::FindOutputSlotOfEdgeSrc(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge) { if (IsInSubgraph(src_func_id)) { const Subgraph& src_subgraph = subgraphs_.at(src_func_id); @@ -860,7 +860,7 @@ int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id, } absl::Status Encapsulator::FindOutputImageOfEdgeDst( - const string& src_func_id, const string& dst_func_id, + const std::string& src_func_id, const std::string& dst_func_id, const absl::flat_hash_map& node_images, const Node* original_dst_node, Node** dst_image) { if (IsInSubgraph(dst_func_id)) { @@ -875,8 +875,8 @@ absl::Status Encapsulator::FindOutputImageOfEdgeDst( return absl::OkStatus(); } -int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id, - const string& dst_func_id, +int Encapsulator::FindOutputSlotOfEdgeDst(const std::string& src_func_id, + const std::string& dst_func_id, const Edge* edge) { if (IsInSubgraph(dst_func_id)) { const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id); @@ -891,7 +891,8 @@ int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id, } absl::Status Encapsulator::CopyEdgeToOutputGraph( - const Edge* edge, const string& src_func_id, const string& dst_func_id, + const Edge* edge, const std::string& src_func_id, + const std::string& dst_func_id, const absl::flat_hash_map& node_images, Graph* graph_out, absl::flat_hash_set, @@ -943,9 +944,9 @@ absl::Status Encapsulator::AddEdgesToOutputGraph( edges_added; for (const Edge* edge : graph_in_->edges()) { - string src_func_id; + std::string src_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id)); - string dst_func_id; + std::string dst_func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id)); // Ignore edges that are strictly contained within one subgraph, unless @@ -1091,7 +1092,7 @@ absl::Status Encapsulator::BuildOutputGraph( } // anonymous namespace absl::Status EncapsulateSubgraphsInFunctions( - string group_attribute, const Graph& graph_in, + std::string group_attribute, const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, std::unique_ptr* graph_out, FunctionLibraryDefinition* library) { Encapsulator encapsulator(std::move(group_attribute), diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h index 0c7729f67349b5..ed2c9ef45a2c16 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -73,7 +73,7 @@ typedef std::function* graph_out, FunctionLibraryDefinition* library); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 1e05ad067def7f..94b136a02b99cf 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -46,7 +46,7 @@ 
const char* const kXlaHostTransferSequencerAttr = "_xla_host_transfer_sequencer"; absl::Status AddGraphDefToFunctionLibrary( - const GraphDefBuilder& graphdef_builder, const string& name_suffix, + const GraphDefBuilder& graphdef_builder, const std::string& name_suffix, FunctionDefLibrary* library) { GraphDef graphdef; TF_RETURN_IF_ERROR(graphdef_builder.ToGraphDef(&graphdef)); @@ -64,13 +64,14 @@ absl::Status AddGraphDefToFunctionLibrary( } template -bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, - const ::tensorflow::protobuf::Map& b, - const std::function& key_to_string, - const std::function& value_to_string, - const std::function& compare, - const string& map_name, string* diff) { +bool EqualProtoMap( + const ::tensorflow::protobuf::Map& a, + const ::tensorflow::protobuf::Map& b, + const std::function& key_to_string, + const std::function& value_to_string, + const std::function& + compare, + const std::string& map_name, std::string* diff) { for (const auto& elt_a : a) { const auto iter = b.find(elt_a.first); if (iter == b.end()) { @@ -106,7 +107,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, } bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, - const string& diff_preamble, string* diff) { + const std::string& diff_preamble, std::string* diff) { if (a.op() != b.op()) { if (diff) { *diff = absl::StrCat(diff_preamble, " mismatch for node ", a.name(), @@ -131,8 +132,8 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } - std::unordered_set control_input_a; - std::unordered_set control_input_b; + std::unordered_set control_input_a; + std::unordered_set control_input_b; for (int i = 0; i < a.input_size(); ++i) { if (absl::StartsWith(a.input(i), "^")) { if (!absl::StartsWith(b.input(i), "^")) { @@ -164,17 +165,17 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } - return EqualProtoMap( - a.attr(), b.attr(), [](const string& s) { return s; }, + return EqualProtoMap( + a.attr(), b.attr(), [](const std::string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, - [](const string& key, const AttrValue& av, const AttrValue& bv) { + [](const std::string& key, const AttrValue& av, const AttrValue& bv) { if (key == "ancestors") { // The ancestors are added from a set so the order is unpredictable; // just compare set equality not list equality. 
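EqualProtoMap above parameterizes map comparison over key/value printers and a comparator, so one helper serves both attr maps and ret maps and can special-case keys like "ancestors". A minimal sketch of the same shape over std::map rather than protobuf maps:

#include <functional>
#include <map>
#include <string>

// Returns true if the maps are equal under `compare`; on mismatch, writes a
// human-readable description of the first difference into *diff.
template <typename K, typename V>
bool EqualMap(const std::map<K, V>& a, const std::map<K, V>& b,
              const std::function<std::string(const K&)>& key_to_string,
              const std::function<std::string(const V&)>& value_to_string,
              const std::function<bool(const K&, const V&, const V&)>& compare,
              std::string* diff) {
  for (const auto& elt_a : a) {
    auto it = b.find(elt_a.first);
    if (it == b.end()) {
      if (diff) *diff = "missing key " + key_to_string(elt_a.first);
      return false;
    }
    if (!compare(elt_a.first, elt_a.second, it->second)) {
      if (diff)
        *diff = "value mismatch for " + key_to_string(elt_a.first) + ": " +
                value_to_string(elt_a.second) + " vs " +
                value_to_string(it->second);
      return false;
    }
  }
  if (a.size() != b.size()) {
    if (diff) *diff = "maps differ in size";
    return false;
  }
  return true;
}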
- std::unordered_set a_set(av.list().s().begin(), - av.list().s().end()); - std::unordered_set b_set(bv.list().s().begin(), - bv.list().s().end()); + std::unordered_set a_set(av.list().s().begin(), + av.list().s().end()); + std::unordered_set b_set(bv.list().s().begin(), + bv.list().s().end()); return a_set == b_set; } else { return av.DebugString() == bv.DebugString(); @@ -184,7 +185,7 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, - string* diff) { + std::string* diff) { if (a.signature().DebugString() != b.signature().DebugString()) { if (diff) { *diff = @@ -194,22 +195,21 @@ bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, } return false; } - if (!EqualProtoMap( - a.attr(), b.attr(), [](const string& s) { return s; }, + if (!EqualProtoMap( + a.attr(), b.attr(), [](const std::string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, - [](const string& key, const AttrValue& av, const AttrValue& bv) { + [](const std::string& key, const AttrValue& av, const AttrValue& bv) { return av.DebugString() == bv.DebugString(); }, absl::StrCat("attr mismatch for function ", a.signature().name()), diff)) { return false; } - if (!EqualProtoMap( - a.ret(), b.ret(), [](const string& s) { return s; }, - [](const string& s) { return s; }, - [](const string& key, const string& av, const string& bv) { - return av == bv; - }, + if (!EqualProtoMap( + a.ret(), b.ret(), [](const std::string& s) { return s; }, + [](const std::string& s) { return s; }, + [](const std::string& key, const std::string& av, + const std::string& bv) { return av == bv; }, absl::StrCat("ret mismatch for function ", a.signature().name()), diff)) { return false; @@ -257,8 +257,9 @@ bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, } bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected, - const FunctionDefLibrary& actual, string* diff) { - std::unordered_map actual_index; + const FunctionDefLibrary& actual, + std::string* diff) { + std::unordered_map actual_index; for (const FunctionDef& function : actual.function()) { actual_index[function.signature().name()] = &function; } @@ -343,7 +344,7 @@ REGISTER_OP("AddNLikeTest") .SetIsAggregate(); Node* Sequencer(const GraphDefBuilder::Options& opts, - const string& call_node_name) { + const std::string& call_node_name) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("NoOp"), "NoOp", opts.op_registry()); @@ -383,7 +384,7 @@ Node* KeyPlaceholderShape(const GraphDefBuilder::Options& opts) { return KnownShapeBase(DT_STRING, {2}, opts); } -Node* KeyPlaceholder(const string& call_node, +Node* KeyPlaceholder(const std::string& call_node, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(absl::StrCat(call_node, "_key_placeholder"), @@ -396,15 +397,16 @@ Node* KeyPlaceholder(const string& call_node, .FinalizeBuilder(&node_builder); } -Node* RecvAtHost(ops::NodeOut key_input, const string& cluster, - const string& new_func_name, const string& oc_cluster, +Node* RecvAtHost(ops::NodeOut key_input, const std::string& cluster, + const std::string& new_func_name, + const std::string& oc_cluster, absl::Span dtypes, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; - string key = absl::StrCat("host_compute_channel_", cluster, "_", - new_func_name, "_", oc_cluster); - string name = absl::StrCat("outside_compilation_", cluster, "_", - new_func_name, 
"_", oc_cluster, "_recv"); + std::string key = absl::StrCat("host_compute_channel_", cluster, "_", + new_func_name, "_", oc_cluster); + std::string name = absl::StrCat("outside_compilation_", cluster, "_", + new_func_name, "_", oc_cluster, "_recv"); NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaRecvAtHost"), "_XlaRecvAtHost", opts.op_registry()); node_builder.Input(std::move(key_input)); @@ -416,15 +418,16 @@ Node* RecvAtHost(ops::NodeOut key_input, const string& cluster, .FinalizeBuilder(&node_builder); } -Node* SendFromHost(ops::NodeOut key_input, const string& cluster, - const string& new_func_name, const string& oc_cluster, +Node* SendFromHost(ops::NodeOut key_input, const std::string& cluster, + const std::string& new_func_name, + const std::string& oc_cluster, const std::vector& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; - string key = absl::StrCat("host_compute_channel_", cluster, "_", - new_func_name, "_", oc_cluster); - string name = absl::StrCat("outside_compilation_", cluster, "_", - new_func_name, "_", oc_cluster, "_send"); + std::string key = absl::StrCat("host_compute_channel_", cluster, "_", + new_func_name, "_", oc_cluster); + std::string name = absl::StrCat("outside_compilation_", cluster, "_", + new_func_name, "_", oc_cluster, "_send"); NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaSendFromHost"), "_XlaSendFromHost", opts.op_registry()); node_builder.Input(inputs); @@ -477,8 +480,9 @@ Node* RetOp(int index, ops::NodeOut a, const GraphDefBuilder::Options& opts) { return opts.FinalizeBuilder(&node_builder); } -absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, - const std::vector& encapsulated_functions) { +absl::Status Encapsulate( + GraphDef* graphdef, FunctionDefLibrary* library, + const std::vector& encapsulated_functions) { absl::Status s; // Convert the GraphDef to a Graph std::unique_ptr lib_def( @@ -512,7 +516,7 @@ absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, &graph_out, lib_def.get()); if (!s.ok()) return s; - std::unordered_map clusters; + std::unordered_map clusters; for (const auto& func : encapsulated_functions) { Node* xla_computation_node; for (Node* n : graph_out->nodes()) { @@ -527,7 +531,7 @@ absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, func_name_attrs.set_name(func); clusters.emplace(func, XlaClusterInfo{func, func_name_attrs, xla_computation_node, - std::map{}}); + std::map{}}); } bool modified; s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters, @@ -551,7 +555,7 @@ absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, } absl::Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) { - std::vector encapsulated_functions; + std::vector encapsulated_functions; return Encapsulate(graphdef, library, encapsulated_functions); } @@ -698,8 +702,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) { } // Returns a vector of node names in 'graph', sorted by name. -std::vector GraphNodes(const Graph& graph) { - std::vector nodes; +std::vector GraphNodes(const Graph& graph) { + std::vector nodes; for (const auto& node : graph.nodes()) { if (!node->IsSource() && !node->IsSink()) { nodes.push_back(node->name()); @@ -710,8 +714,9 @@ std::vector GraphNodes(const Graph& graph) { } // Returns a sorted vector of (src, dst) edges in 'graph'. 
-std::vector> GraphEdges(const Graph& graph) { - std::vector> edges; +std::vector> GraphEdges( + const Graph& graph) { + std::vector> edges; for (const Edge* edge : graph.edges()) { if (edge->src()->IsSource() || edge->dst()->IsSink()) continue; edges.emplace_back( @@ -742,10 +747,11 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { /*rewrite_subgraph_fn=*/{}, /*reuse_existing_functions=*/false, &graph, &library)); - std::vector expected_nodes = {"cluster1", "cluster2", "mul", "x"}; + std::vector expected_nodes = {"cluster1", "cluster2", "mul", + "x"}; EXPECT_EQ(expected_nodes, GraphNodes(*graph)); - std::vector> expected_edges = { + std::vector> expected_edges = { {"cluster1:0", "cluster2:0"}, {"cluster1:0", "mul:0"}, {"cluster2:0", "mul:1"}, @@ -753,7 +759,7 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { EXPECT_EQ(expected_edges, GraphEdges(*graph)); } -const Node* FindNodeByName(const Graph& graph, const string& name) { +const Node* FindNodeByName(const Graph& graph, const std::string& name) { for (const Node* node : graph.nodes()) { if (node->name() == name) return node; } @@ -889,7 +895,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -931,7 +937,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"C:o:0", "c:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -941,7 +947,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"c"}}, @@ -1025,7 +1031,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1102,7 +1108,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"F:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT, DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -1112,8 +1118,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"F", "outside_compilation_O1_host_compute"}}, @@ -1122,7 +1129,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1132,7 +1139,7 @@ 
TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1235,7 +1242,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1", "F2"}; + std::vector encapsulated_functions{"F1", "F2"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1262,7 +1269,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1273,7 +1280,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1295,7 +1302,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"d_0_arg", "G:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_F2_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1306,7 +1313,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1409,7 +1416,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1", "F2"}; + std::vector encapsulated_functions{"F1", "F2"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1432,7 +1439,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1443,7 +1450,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1462,7 +1469,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {"G:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_F2_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1473,7 +1480,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", 
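The repeated absl::Span<const std::string>({...}) expressions in these expectations construct a span over a temporary backing array that lives for the duration of the enclosing call. A minimal sketch of that idiom:

#include <cstddef>
#include <iostream>
#include <string>
#include "absl/types/span.h"

std::size_t CountTokens(absl::Span<const std::string> tokens) {
  return tokens.size();
}

int main() {
  // The braced list materializes a temporary array backing the span.
  std::cout << CountTokens({"_xla_token_arg_node",
                            "outside_compilation_O1_host_compute"})
            << "\n";  // prints 2
}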
"O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1556,7 +1563,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1578,7 +1585,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1589,7 +1596,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1652,7 +1659,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1674,7 +1681,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1685,7 +1692,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"D"}}, @@ -1748,7 +1755,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1785,7 +1792,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1795,7 +1802,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1858,7 +1865,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -1899,7 +1906,7 @@ TEST(EncapsulateSubgraphsTest, 
OutsideCompilationControlOutput) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -1909,7 +1916,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -1978,7 +1985,7 @@ TEST(EncapsulateSubgraphsTest, TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2037,7 +2044,7 @@ TEST(EncapsulateSubgraphsTest, {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2047,7 +2054,7 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, {{"outside_compilation_O2_host_compute"}, @@ -2055,7 +2062,7 @@ TEST(EncapsulateSubgraphsTest, {"F:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -2065,8 +2072,9 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"outside_compilation_O1_host_compute"}}, @@ -2149,7 +2157,7 @@ TEST(EncapsulateSubgraphsTest, TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2189,7 +2197,7 @@ TEST(EncapsulateSubgraphsTest, {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -2199,8 +2207,9 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"outside_compilation_O1_host_compute"}}, @@ -2209,7 +2218,7 @@ TEST(EncapsulateSubgraphsTest, {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2219,7 +2228,7 @@ 
TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -2303,7 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2340,7 +2349,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2350,7 +2359,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, {{"outside_compilation_O2_host_compute"}, @@ -2358,7 +2367,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O2"}, {"send_key", ""}, {"recv_key", ""}, @@ -2368,7 +2377,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span( + absl::Span( {"_xla_token_arg_node", "outside_compilation_O1_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O2_host_compute"}}, {"outside_compilation_O1_host_compute"}}, @@ -2377,7 +2386,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"D:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O3"}, {"send_key", ""}, {"recv_key", ""}, @@ -2387,9 +2396,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O3"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node", - "outside_compilation_O1_host_compute", - "outside_compilation_O2_host_compute"})}, + absl::Span( + {"_xla_token_arg_node", "outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}, {"_xla_original_oc_node_name", "outside_compilation_O3_host_compute"}}, {"outside_compilation_O1_host_compute", "outside_compilation_O2_host_compute"}}}, @@ -2470,7 +2479,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2507,7 +2516,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { {"a_0_arg"}, {{"Tinputs", absl::Span({DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, 
{"send_key", ""}, {"recv_key", ""}, @@ -2517,7 +2526,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}}, }, @@ -2586,7 +2595,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } - std::vector encapsulated_functions{"F1"}; + std::vector encapsulated_functions{"F1"}; TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions)); FunctionDefLibrary library_expected; @@ -2627,7 +2636,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { {"c_0_arg", "c:o:0"}, {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, {"Toutputs", absl::Span({DT_FLOAT})}, - {"ancestors", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_F1_O1"}, {"send_key", ""}, {"recv_key", ""}, @@ -2637,7 +2646,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}, {"_xla_token_input_nodes", - absl::Span({"_xla_token_arg_node"})}, + absl::Span({"_xla_token_arg_node"})}, {"_xla_original_oc_node_name", "outside_compilation_O1_host_compute"}}, {"c"}}, diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc index fa94a341bbabc6..445ca63c05ad66 100644 --- a/tensorflow/compiler/jit/encapsulate_util.cc +++ b/tensorflow/compiler/jit/encapsulate_util.cc @@ -36,7 +36,8 @@ namespace { // Returns string attribute value for the node if the attribute is present, // otherwise returns empty optional value. -std::optional GetStringAttr(const Node& n, const string& attr_name) { +std::optional GetStringAttr(const Node& n, + const std::string& attr_name) { auto attr = n.attrs().Find(attr_name); if (!attr) { return std::nullopt; @@ -47,8 +48,8 @@ std::optional GetStringAttr(const Node& n, const string& attr_name) { // Adds a value to the node's list attribute. template -absl::Status AppendToListAttr(Node* n, const string& attr_name, - const string& value) { +absl::Status AppendToListAttr(Node* n, const std::string& attr_name, + const std::string& value) { std::vector attr_value; absl::Status s = GetNodeAttr(n->attrs(), attr_name, &attr_value); if (!s.ok() && s.code() != error::NOT_FOUND) { @@ -63,7 +64,7 @@ absl::Status AppendToListAttr(Node* n, const string& attr_name, // Replaces attribute value. template -void ReplaceAttr(Node* n, const string& attr_name, const T& value) { +void ReplaceAttr(Node* n, const std::string& attr_name, const T& value) { n->ClearAttr(attr_name); n->AddAttr(attr_name, value); } @@ -71,7 +72,7 @@ void ReplaceAttr(Node* n, const string& attr_name, const T& value) { // Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of // `PreprocessEdgesBetweenOutsideCompilations` for details. absl::Status PreprocessControlEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Gather edges to remove. We should not remove the edge while iterating. std::vector edges_to_remove; for (const Edge* e : g->edges()) { @@ -89,7 +90,7 @@ absl::Status PreprocessControlEdgesBetweenOutsideCompilations( // Case 1a: outside compilation to outside compilation control edge. 
edges_to_remove.push_back(e); - TF_RETURN_IF_ERROR(AppendToListAttr( + TF_RETURN_IF_ERROR(AppendToListAttr( e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName, e->src()->name())); } @@ -111,7 +112,7 @@ absl::Status PreprocessControlEdgesBetweenOutsideCompilations( // Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of // `PreprocessEdgesBetweenOutsideCompilations` for details. absl::Status PreprocessDataEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Gather edges between outside compilation and host computation. Notice that // we do not store `Edge*` directly because we remove some nodes while adding // Identity nodes, and those Edge pointers might be invalidated. @@ -138,7 +139,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( // Remove the edge from host to outside compilation. Add a placeholder as // outside compilation node input. - std::map, Node*> placeholders; + std::map, Node*> placeholders; for (int i = 0, end = edges.size(); i < end; i++) { Node* dst = g->FindNodeId(edges[i].dst_node_id); const Edge* e; @@ -148,7 +149,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( g->RemoveEdge(e); // Find or create placeholder node. - string new_name = + std::string new_name = absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output); auto placeholder_index = std::make_pair(src->name(), src_output); auto iter = placeholders.find(placeholder_index); @@ -156,7 +157,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( if (iter == placeholders.end()) { NodeDefBuilder placeholder_builder(new_name, "Placeholder"); placeholder_builder.Attr("dtype", src->output_type(src_output)); - string outside_compilation_attr; + std::string outside_compilation_attr; TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), outside_compilation_attr_name, &outside_compilation_attr)); @@ -195,7 +196,7 @@ absl::Status PreprocessDataEdgesBetweenOutsideCompilations( // Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of // `PostprocessEdgesBetweenOutsideCompilations` for details. absl::Status PostprocessDataEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Gather all outside compilation to outside compilation nodes. std::vector placeholder_nodes; for (Node* n : g->nodes()) { @@ -208,7 +209,7 @@ absl::Status PostprocessDataEdgesBetweenOutsideCompilations( // Remove the placeholder nodes, and reconnect original edge. auto node_name_index = g->BuildNodeNameIndex(); for (auto n : placeholder_nodes) { - string node_name; + std::string node_name; int node_src_output; TF_RETURN_IF_ERROR(GetNodeAttr( n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name)); @@ -271,12 +272,12 @@ absl::Status PostprocessDataEdgesBetweenOutsideCompilations( // Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of // `PostprocessEdgesBetweenOutsideCompilations` for details. absl::Status PostprocessControlEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { auto node_name_index = g->BuildNodeNameIndex(); // Reconnect outside compilation to outside compilation control edge. 
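The data-edge preprocessing above creates at most one Placeholder per (source node, output slot) pair, memoized in a std::map keyed by that pair. A sketch of the find-or-create step over a hypothetical Node stand-in; in TF the graph owns the node, so the raw new here is purely illustrative:

#include <map>
#include <string>
#include <utility>
#include "absl/strings/str_cat.h"

struct Node { std::string name; };  // stand-in for tensorflow::Node

Node* FindOrCreatePlaceholder(
    std::map<std::pair<std::string, int>, Node*>& placeholders,
    const std::string& src_name, int src_output) {
  auto key = std::make_pair(src_name, src_output);
  auto it = placeholders.find(key);
  if (it != placeholders.end()) return it->second;  // reuse existing node
  // Same naming scheme as the hunk above.
  Node* n =
      new Node{absl::StrCat(src_name, "_oc_to_oc_placeholder_", src_output)};
  placeholders.emplace(key, n);
  return n;
}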
for (Node* n : g->nodes()) { - std::vector control_deps; + std::vector control_deps; absl::Status s = GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName, &control_deps); @@ -288,7 +289,7 @@ absl::Status PostprocessControlEdgesBetweenOutsideCompilations( } } else { n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName); - for (const string& control_input : control_deps) { + for (const std::string& control_input : control_deps) { auto iter = node_name_index.find(control_input); if (iter == node_name_index.end()) { return errors::Internal("Cannot find original node for ", @@ -342,11 +343,11 @@ absl::Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g) { } absl::StatusOr< - std::unique_ptr>>> + std::unique_ptr>>> OutsideCompilationClusterDependencies( - const Graph* g, const string& outside_compilation_attr_name) { + const Graph* g, const std::string& outside_compilation_attr_name) { auto cluster_deps = std::make_unique< - absl::flat_hash_map>>(); + absl::flat_hash_map>>(); for (const Edge* e : g->edges()) { auto src_outside_compilation = @@ -360,18 +361,18 @@ OutsideCompilationClusterDependencies( if (dst_deps_it == cluster_deps->end()) { cluster_deps->insert(std::make_pair( *dst_outside_compilation, - absl::flat_hash_set({*src_outside_compilation}))); + absl::flat_hash_set({*src_outside_compilation}))); } else { dst_deps_it->second.insert(*src_outside_compilation); } } } - auto cluster_deps_ordered = - std::make_unique>>(); + auto cluster_deps_ordered = std::make_unique< + absl::flat_hash_map>>(); for (auto it = cluster_deps->begin(); it != cluster_deps->end(); it++) { - std::vector ordered_deps(it->second.begin(), it->second.end()); + std::vector ordered_deps(it->second.begin(), it->second.end()); std::sort(ordered_deps.begin(), ordered_deps.end()); cluster_deps_ordered->insert(std::make_pair(it->first, ordered_deps)); } @@ -380,7 +381,7 @@ OutsideCompilationClusterDependencies( } absl::Status PreprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { // Remove edges from source node to outside compilation nodes, and edges // from outside compilation nodes to sink node. std::vector edges_to_remove; @@ -406,7 +407,7 @@ absl::Status PreprocessEdgesBetweenOutsideCompilations( } absl::Status PostprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations( g, outside_compilation_attr_name)); TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations( diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index 7c99763c770728..81ab31c79dcda2 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -95,21 +95,21 @@ struct XlaClusterInfo { // without losing aggregate initialization, which allows us to get rid of // the constructor definitions again. XlaClusterInfo() {} - XlaClusterInfo(const string& cluster_name, + XlaClusterInfo(const std::string& cluster_name, const NameAttrList& func_name_attrs, Node* node, - const std::map& host_compute_core) + const std::map& host_compute_core) : cluster_name(cluster_name), func_name_attrs(func_name_attrs), node(node), host_compute_core(host_compute_core) {} // XLA cluster name. It might be different from `func_name`. 
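OutsideCompilationClusterDependencies above finishes by copying each dependency set into a sorted vector: flat_hash_set iteration order is unspecified, and downstream graph rewrites need deterministic output. A compact sketch of that determinism step:

#include <algorithm>
#include <string>
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"

absl::flat_hash_map<std::string, std::vector<std::string>> OrderDeps(
    const absl::flat_hash_map<std::string, absl::flat_hash_set<std::string>>&
        deps) {
  absl::flat_hash_map<std::string, std::vector<std::string>> ordered;
  for (const auto& [cluster, dep_set] : deps) {
    std::vector<std::string> v(dep_set.begin(), dep_set.end());
    std::sort(v.begin(), v.end());  // fix an order before anyone consumes it
    ordered.emplace(cluster, std::move(v));
  }
  return ordered;
}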
- const string cluster_name; + const std::string cluster_name; // Name and attributes of XLA computation function. const NameAttrList func_name_attrs; // The XLA computation node in the graph. Node* node; // A mapping from outside compilation cluster name to its device assignment. - const std::map host_compute_core; + const std::map host_compute_core; }; // Finds dependencies between outside compilation clusters, including both data @@ -117,9 +117,9 @@ struct XlaClusterInfo { // outside compilation cluster to a set of names of outside compilation clusters // that it depends on. absl::StatusOr< - std::unique_ptr>>> + std::unique_ptr>>> OutsideCompilationClusterDependencies( - const Graph* g, const string& outside_compilation_attr_name); + const Graph* g, const std::string& outside_compilation_attr_name); // Preprocesses edges within the same XLA cluster. It will perform the following // operations in order: @@ -135,7 +135,7 @@ OutsideCompilationClusterDependencies( // 2. For data edges between different outside compilations, remove the edge // and create a Placeholder node as dst node's input. absl::Status PreprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name); + Graph* g, const std::string& outside_compilation_attr_name); // Postprocesses edges within the same XLA cluster. This function reverts what // `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the @@ -149,7 +149,7 @@ absl::Status PreprocessEdgesBetweenOutsideCompilations( // `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here. // They are handled in `RewriteOutsideCompilationSubgraphFn`. absl::Status PostprocessEdgesBetweenOutsideCompilations( - Graph* g, const string& outside_compilation_attr_name); + Graph* g, const std::string& outside_compilation_attr_name); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index 0e59bf0c19d93e..8ba11404010363 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -46,7 +46,7 @@ const char* const kXlaClusterOutput = "XlaClusterOutput"; bool IsCpuGpuCompile(const Graph* graph) { for (Node* n : graph->nodes()) { - string name; + std::string name; // Only consider nodes being compiled. if (!TryGetNodeAttr(n->attrs(), kXlaClusterIdAttr, &name)) continue; // Early return for any node with a device that is not a CPU or GPU. @@ -185,7 +185,7 @@ absl::Status RewriteSubgraph( // Uniquify the function name by computing a fingerprint of the function. // Nondeterminism in serialization would not lead to incorrect results, but // may cause spurious cache misses. 
- TF_ASSIGN_OR_RETURN(uint64 fingerprint, FingerprintGraph(*graph)); + TF_ASSIGN_OR_RETURN(uint64_t fingerprint, FingerprintGraph(*graph)); VLOG(1) << "Subgraph fingerprint:" << fingerprint; call_def->set_op(absl::StrCat(call_def->op(), "_", fingerprint)); return absl::OkStatus(); @@ -360,7 +360,8 @@ absl::Status RewriteSubgraph( /*static*/ absl::Status EncapsulateXlaComputationsPass::BuildXlaLaunchOps( Graph* graph) { const auto is_xla_launch_node = [](const Node& node) -> absl::StatusOr { - const string& name = GetNodeAttrString(node.attrs(), kXlaClusterIdAttr); + const std::string& name = + GetNodeAttrString(node.attrs(), kXlaClusterIdAttr); return !name.empty(); }; diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc index 16a17c3c2a03a6..acd5319cf8ed16 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc @@ -34,7 +34,7 @@ limitations under the License. namespace tensorflow { static std::unique_ptr MakeOuterGraph( - const FunctionLibraryDefinition& flib_def, const string& function) { + const FunctionLibraryDefinition& flib_def, const std::string& function) { Scope scope = Scope::NewRootScope().ExitOnError(); TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(flib_def.ToProto())); @@ -143,7 +143,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) { // Test that control edge insertion order doesn't affect the cache key // (cluster name) generated by TPU encapsulate pass. auto get_serialized_graph = [](bool control_input_reversed, - bool operand_reversed) -> string { + bool operand_reversed) -> std::string { FunctionLibraryDefinition flib_def(OpRegistry::Global(), FunctionDefLibrary()); std::unique_ptr graph(new Graph(&flib_def)); @@ -250,8 +250,8 @@ TEST(EncapsulateXlaComputations, Encapsulate) { TF_ASSERT_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def)); - std::unordered_map index = graph->BuildNodeNameIndex(); - string function = index.at("launch0")->type_string(); + std::unordered_map index = graph->BuildNodeNameIndex(); + std::string function = index.at("launch0")->type_string(); // Tests the outer graph is as expected. { @@ -285,9 +285,9 @@ TEST(EncapsulateXlaComputations, Encapsulate) { // function. Encapsulation should be deterministic to avoid recompilation. TF_ASSERT_OK( EncapsulateXlaComputationsPass::Encapsulate(&graph_copy, &flib_def)); - std::unordered_map index_copy = + std::unordered_map index_copy = graph_copy->BuildNodeNameIndex(); - string function_copy = index_copy.at("launch0")->type_string(); + std::string function_copy = index_copy.at("launch0")->type_string(); EXPECT_EQ(function, function_copy); } diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 140c47dbcac804..05514f00bd29d5 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -42,7 +42,7 @@ namespace { // Control return mapping function for outside compilation host graphs. // All nodes with kXlaHasHostTransfer attribute are control outputs. 
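RewriteSubgraph above appends a graph fingerprint to the function name so that identical subgraphs map to the same compilation cache key, while nondeterministic serialization costs at most a spurious cache miss. A loose sketch of the naming step, with std::hash standing in for TF's FingerprintGraph:

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include "absl/strings/str_cat.h"

std::string UniquifiedName(const std::string& base,
                           const std::string& serialized_graph) {
  // std::hash is only a stand-in for a real graph fingerprint.
  uint64_t fingerprint = std::hash<std::string>{}(serialized_graph);
  return absl::StrCat(base, "_", fingerprint);
}

int main() {
  std::cout << UniquifiedName("cluster_0", "node{a->b}") << "\n";
}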
-std::optional HostGraphControlRetMapping(const Node* n) { +std::optional HostGraphControlRetMapping(const Node* n) { if (HasNodeAttr(n->def(), kXlaHasHostTransferAttrName)) { return n->name(); } @@ -52,7 +52,7 @@ std::optional HostGraphControlRetMapping(const Node* n) { // Add a key placeholder node to the graph. The key placeholder node will be // used as input for XlaRecvAtHost/XlaSendFromHost nodes. absl::StatusOr AddHostComputeKeyPlaceholder( - const string& xla_cluster_name, Graph* g) { + const std::string& xla_cluster_name, Graph* g) { NodeDef key_def; NodeDefBuilder builder(absl::StrCat(xla_cluster_name, "_key_placeholder"), "Placeholder"); @@ -74,7 +74,8 @@ bool IsKeyPlaceholderNode(const Node& n) { } // Returns nodes with given type. -std::vector GatherNodesWithType(const Graph& g, const string& type) { +std::vector GatherNodesWithType(const Graph& g, + const std::string& type) { std::vector result; for (Node* n : g.nodes()) { if (n->type_string() == type) { @@ -105,7 +106,7 @@ absl::Status GetArgDataTypes(const std::vector& arg_nodes, // Builds XlaRecvAtHost node. absl::StatusOr BuildRecvAtHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, const std::vector& recv_at_host_dtypes, Node* key_placeholder) { NodeDefBuilder recv_at_host_builder( absl::StrCat("outside_compilation_", oc_cluster_name, "_recv"), @@ -128,7 +129,7 @@ absl::StatusOr BuildRecvAtHostNode( // Builds XlaRecvAtHost node, and replaces all _Arg nodes with it. absl::StatusOr ReplaceArgNodesWithRecvAtHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, std::vector* recv_at_host_dtypes, Node* key_placeholder) { // TODO(b/77601805): use out nodes for source node, instead of traversing all // nodes. @@ -205,7 +206,7 @@ absl::Status GetRetDataTypes(const std::vector& ret_nodes, // Builds XlaSendFromHost node. absl::StatusOr BuildSendFromHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, const std::vector& ret_nodes, const std::vector& send_from_host_dtypes, Node* key_placeholder) { NodeDefBuilder send_from_host_builder( @@ -245,7 +246,7 @@ absl::StatusOr BuildSendFromHostNode( // Builds XlaSendFromHost node, and replaces all _Retval nodes with it. absl::StatusOr ReplaceRetNodesWithSendFromHostNode( - Graph* g, const string& oc_cluster_name, + Graph* g, const std::string& oc_cluster_name, std::vector* send_from_host_dtypes, Node* key_placeholder) { // TODO(b/77601805): use in nodes for sink node, instead of traversing all // nodes. @@ -299,16 +300,17 @@ std::optional> GetInferredInputShapes( return results; } -string host_compute_node_name(const string& original_oc_name) { +std::string host_compute_node_name(const std::string& original_oc_name) { return absl::StrCat("outside_compilation_", original_oc_name, "_host_compute"); } // Builds XlaHostCompute NodeDef from the outside compilation call node. 
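GatherNodesWithType above is a plain filter over the graph's nodes. A minimal sketch with a stub Node type, just to make the contract concrete:

#include <string>
#include <vector>

struct Node {
  std::string type_string;  // stand-in for Node::type_string()
};

std::vector<Node*> GatherNodesWithType(std::vector<Node>& graph,
                                       const std::string& type) {
  std::vector<Node*> result;
  for (Node& n : graph) {
    if (n.type_string == type) result.push_back(&n);
  }
  return result;
}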
absl::StatusOr BuildXlaHostComputeNodeDef( - const Node* call_node, const std::map& host_compute_core, - const absl::flat_hash_map>& cluster_deps) { - string original_oc_name; + const Node* call_node, const std::map& host_compute_core, + const absl::flat_hash_map>& + cluster_deps) { + std::string original_oc_name; TF_RETURN_IF_ERROR(GetNodeAttr( call_node->attrs(), "_outside_compilation_subgraph", &original_oc_name)); NodeDefBuilder host_compute_builder(host_compute_node_name(original_oc_name), @@ -341,7 +343,7 @@ absl::StatusOr BuildXlaHostComputeNodeDef( // according to their host-side graph dependency. This can cause deadlock. // Therefore, we hint XLA what the correct ordering of these clusters should // be to avoid deadlocks. - std::vector xla_token_input_nodes; + std::vector xla_token_input_nodes; xla_token_input_nodes.emplace_back(kXlaTokenArgNodeName); auto cluster_deps_it = cluster_deps.find(original_oc_name); if (cluster_deps_it != cluster_deps.end()) { @@ -376,8 +378,10 @@ absl::StatusOr BuildXlaHostComputeNodeDef( // Replace outside compilation function call node with XlaHostCompute node. TF_ATTRIBUTE_NOINLINE absl::StatusOr ReplaceOutsideCompilationCallNode( - Graph* g, Node* call_node, const std::map& host_compute_core, - const absl::flat_hash_map>& cluster_deps) { + Graph* g, Node* call_node, + const std::map& host_compute_core, + const absl::flat_hash_map>& + cluster_deps) { // Build XlaHostCompute NodeDef. TF_ASSIGN_OR_RETURN( NodeDef node_def, @@ -405,8 +409,8 @@ absl::Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) { n->ClearAttr("device_ordinal"); n->AddAttr("device_ordinal", device_ordinal_value); } else if (n->IsIfNode()) { - for (const string& attr_name : - std::vector{"then_branch", "else_branch"}) { + for (const std::string& attr_name : + std::vector{"then_branch", "else_branch"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -414,7 +418,8 @@ absl::Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) { n->AddAttr(attr_name, branch_func); } } else if (n->IsWhileNode()) { - for (const string& attr_name : std::vector{"cond", "body"}) { + for (const std::string& attr_name : + std::vector{"cond", "body"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -448,11 +453,12 @@ bool HasLiftedArgs(const FunctionDef& function_def) { absl::StatusOr>> LiftedArgsAndOutsideCompilationNodesInFunctionBody( const FunctionBody& function_body, - const std::unordered_map& outside_compilation_attr_to_node) { + const std::unordered_map& + outside_compilation_attr_to_node) { std::vector> lifted_arg_nodes_and_outside_compilation_nodes; for (Node* n : function_body.graph->op_nodes()) { - string oc_cluster; + std::string oc_cluster; if (n->type_string() == "Placeholder" && GetNodeAttr(n->def(), kXlaLiftedArgOutsideCompilationAttrName, &oc_cluster) @@ -471,7 +477,7 @@ LiftedArgsAndOutsideCompilationNodesInFunctionBody( absl::StatusOr> UpdateTypesAttribute( const std::vector>& lifted_arg_nodes_and_outside_compilation_nodes, - const string& type_attr_name, Node* n) { + const std::string& type_attr_name, Node* n) { std::vector data_types; data_types.reserve(lifted_arg_nodes_and_outside_compilation_nodes.size()); TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), type_attr_name, &data_types)); @@ -578,7 +584,8 @@ absl::Status 
AddFunctionWithNewName(const std::string& new_name, // Reconnect outside compilation lifted arguments in a functional While node to // its outside compilation tensor sources. absl::Status PostprocessLiftedArgsForWhile( - const std::unordered_map& outside_compilation_attr_to_node, + const std::unordered_map& + outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { TF_RET_CHECK(n->IsWhileNode()); @@ -687,7 +694,8 @@ absl::Status PostprocessLiftedArgsForWhile( } absl::Status PostprocessLiftedArgsForIf( - const std::unordered_map& outside_compilation_attr_to_node, + const std::unordered_map& + outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { TF_RET_CHECK(n->IsIfNode()); @@ -826,7 +834,8 @@ absl::Status PostprocessLiftedArgsForIf( } absl::Status PostprocessLiftedArgsForCall( - const std::unordered_map& outside_compilation_attr_to_node, + const std::unordered_map& + outside_compilation_attr_to_node, Graph* g, Node* n, FunctionLibraryDefinition* fld) { const FunctionDef* fdef = fld->Find(n->type_string()); TF_RET_CHECK(fdef); @@ -924,12 +933,12 @@ absl::Status PostprocessLiftedArgsForCall( // Creates a mapping from outside compilation cluster name to lifted argument // placeholder. -absl::StatusOr> OutsideCompilationAttrToNode( - const Graph& g) { - std::unordered_map outside_compilation_attr_to_node; +absl::StatusOr> +OutsideCompilationAttrToNode(const Graph& g) { + std::unordered_map outside_compilation_attr_to_node; for (Node* n : g.op_nodes()) { bool is_lifted_arg; - string outside_compilation_attr; + std::string outside_compilation_attr; if (TryGetNodeAttr(n->def(), kXlaIsLiftedArgAttrName, &is_lifted_arg) && TryGetNodeAttr(n->def(), "_xla_outside_compilation", &outside_compilation_attr)) { @@ -988,8 +997,9 @@ absl::Status PostprocessLiftedArgs(Graph* g, FunctionLibraryDefinition* fld) { // replace this node with compilation result node. // 3) all outside compilation graphs. absl::Status ConstructHostGraph( - const string& xla_cluster_name, const string& outside_compilation_attr_name, - const std::vector& outside_compilation_host_graphs, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::vector& outside_compilation_host_graphs, FunctionLibraryDefinition* fld, std::unique_ptr* host_graph) { host_graph->reset(new Graph(fld)); @@ -1013,7 +1023,7 @@ absl::Status ConstructHostGraph( // XlaSendFromHost, If/While nodes containing // XlaRecvAtHost/XlaSendFromHost) to sequencer node. // c) Clear node_def.device(), so device placer won't get confused. - for (const string& host_func : outside_compilation_host_graphs) { + for (const std::string& host_func : outside_compilation_host_graphs) { VLOG(4) << "Expanding host graph " << host_func; // Temporarily use "0" as "_device_ordinal". It will be reset to placeholder // value after we expanded all host graphs. We cannot just use placeholder @@ -1021,7 +1031,7 @@ absl::Status ConstructHostGraph( // value for attributes. AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr host_fbody; const FunctionDef* host_fdef = fld->Find(host_func); @@ -1123,18 +1133,17 @@ absl::Status ConstructHostGraph( // Expand XLA computation's outside compilation host side graph into main graph. // Add a control edge between sequencer node and the XLA computation node. 
-absl::Status ExpandHostGraphIntoMainGraph(Graph* main_graph, - FunctionLibraryDefinition* fld, - const string& host_graph_func_name, - Node* xla_computation_node, - Node* pivot_node) { +absl::Status ExpandHostGraphIntoMainGraph( + Graph* main_graph, FunctionLibraryDefinition* fld, + const std::string& host_graph_func_name, Node* xla_computation_node, + Node* pivot_node) { // Temporarily use "0" as "_device_ordinal". It will be rewritten with the // correct value in a later pass. We cannot just use placeholder value here // because FunctionDef instantiation does not allow placeholder value for // attributes. AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr fbody; const FunctionDef* host_graph_func = fld->Find(host_graph_func_name); @@ -1207,12 +1216,12 @@ absl::Status ExpandHostGraphIntoMainGraph(Graph* main_graph, // 2) Remove control edges. // 3) Prune nodes that are not useful for shape inference. absl::Status RewriteShapeInferenceGraph( - const string& shape_inference_graph_name, Graph* host_graph, + const std::string& shape_inference_graph_name, Graph* host_graph, Node* pivot_node, FunctionLibraryDefinition* fld) { // Use "0" as "_device_ordinal". It does not matter for shape inference. AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr fbody; const FunctionDef* shape_inference_graph = @@ -1338,13 +1347,13 @@ void SetMaximalSharding(NodeDefBuilder& node_builder) { // Builds XlaSendToHost node which sends cond predicate to host. TF_ATTRIBUTE_NOINLINE absl::StatusOr BuildSendIfPredNode( - const string& name, const string& host_transfer_key, Node* pred_node, - Graph* g) { + const std::string& name, const std::string& host_transfer_key, + Node* pred_node, Graph* g) { NodeDefBuilder send_pred_builder(name, "XlaSendToHost"); send_pred_builder.Attr("Tinput", DT_BOOL); send_pred_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0")); send_pred_builder.Attr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); send_pred_builder.Attr(kXlaOriginalOutsideCompilationNodeName, name); SetMaximalSharding(send_pred_builder); send_pred_builder.Input(pred_node->name(), 0, DT_BOOL); @@ -1356,14 +1365,14 @@ TF_ATTRIBUTE_NOINLINE absl::StatusOr BuildSendIfPredNode( } // Replaces key placeholder node with an _Arg node. -absl::Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name, - const string& func_name, - FunctionLibraryDefinition* fld) { +absl::Status ReplaceKeyPlaceholderWithArgNode( + const std::string& xla_cluster_name, const std::string& func_name, + FunctionLibraryDefinition* fld) { // Temporarily use "0" as "_device_ordinal". It will be reset to placeholder // value after rewriting. AttrValue device_ordinal_attr; device_ordinal_attr.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_attr; std::unique_ptr fbody; const FunctionDef* func = fld->Find(func_name); @@ -1404,14 +1413,15 @@ absl::Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name, // Builds host side graph for If node. 
TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForIfNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const string& if_node_name, const string& host_transfer_key, - const string& host_graph_func_name, FunctionLibraryDefinition* fld, - const string& then_branch_host_func_name, - const string& else_branch_host_func_name) { + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const std::string& if_node_name, + const std::string& host_transfer_key, + const std::string& host_graph_func_name, FunctionLibraryDefinition* fld, + const std::string& then_branch_host_func_name, + const std::string& else_branch_host_func_name) { Graph host_graph(fld); - string outside_compilation_name = absl::StrCat("oc_if_", if_node_name); + std::string outside_compilation_name = absl::StrCat("oc_if_", if_node_name); AttrValue device_ordinal_value; device_ordinal_value.set_placeholder("_device_ordinal"); @@ -1484,7 +1494,7 @@ TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForIfNode( // Rewrites loop cond to add a node which sends loop cond to host. TF_ATTRIBUTE_NOINLINE absl::Status AddSendLoopPredToLoopCond( - const string& cond_xla_func_name, const string& host_transfer_key, + const std::string& cond_xla_func_name, const std::string& host_transfer_key, NameAttrList* loop_cond_func, FunctionLibraryDefinition* fld, Node* while_node) { // Instantiate the loop cond function. @@ -1523,7 +1533,7 @@ TF_ATTRIBUTE_NOINLINE absl::Status AddSendLoopPredToLoopCond( send_loop_cond_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0")); send_loop_cond_builder.Attr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); send_loop_cond_builder.Attr(kXlaOriginalOutsideCompilationNodeName, send_loop_cond_builder.node_name()); SetMaximalSharding(send_loop_cond_builder); @@ -1560,10 +1570,13 @@ TF_ATTRIBUTE_NOINLINE absl::Status AddSendLoopPredToLoopCond( // Rewrites while loop cond function for host. absl::Status RewriteHostWhileLoopCond( - const string& cond_host_func_name, const string& while_node_name, - const string& host_transfer_key, const string& xla_cluster_attr_name, - const string& xla_cluster_name, const string& outside_compilation_attr_name, - const string& outside_compilation_name, FunctionLibraryDefinition* fld) { + const std::string& cond_host_func_name, const std::string& while_node_name, + const std::string& host_transfer_key, + const std::string& xla_cluster_attr_name, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::string& outside_compilation_name, + FunctionLibraryDefinition* fld) { // Replace key placeholder node with _Arg node. TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode( xla_cluster_name, cond_host_func_name, fld)); @@ -1571,7 +1584,7 @@ absl::Status RewriteHostWhileLoopCond( // Instantiate cond function. AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_temp_value; std::unique_ptr cond_fbody; const FunctionDef* cond_host_func = fld->Find(cond_host_func_name); @@ -1634,10 +1647,13 @@ absl::Status RewriteHostWhileLoopCond( // Rewrites while loop body function for host. 
absl::Status RewriteHostWhileLoopBody( - const string& body_host_func_name, const string& while_node_name, - const string& host_transfer_key, const string& xla_cluster_attr_name, - const string& xla_cluster_name, const string& outside_compilation_attr_name, - const string& outside_compilation_name, FunctionLibraryDefinition* fld) { + const std::string& body_host_func_name, const std::string& while_node_name, + const std::string& host_transfer_key, + const std::string& xla_cluster_attr_name, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::string& outside_compilation_name, + FunctionLibraryDefinition* fld) { // Replace key placeholder node with _Arg node. TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode( xla_cluster_name, body_host_func_name, fld)); @@ -1645,7 +1661,7 @@ absl::Status RewriteHostWhileLoopBody( // Instantiate body function. AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map attrs; + protobuf::Map attrs; attrs["_device_ordinal"] = device_ordinal_temp_value; std::unique_ptr body_fbody; const FunctionDef* body_host_func = fld->Find(body_host_func_name); @@ -1692,13 +1708,16 @@ absl::Status RewriteHostWhileLoopBody( // Builds host side graph for while node. TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForWhileNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const string& while_node_name, const string& host_transfer_key, - const string& host_graph_func_name, FunctionLibraryDefinition* fld, - const string& cond_host_func_name, const string& body_host_func_name) { + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const std::string& while_node_name, + const std::string& host_transfer_key, + const std::string& host_graph_func_name, FunctionLibraryDefinition* fld, + const std::string& cond_host_func_name, + const std::string& body_host_func_name) { Graph host_graph(fld); - string outside_compilation_name = absl::StrCat("oc_while_", while_node_name); + std::string outside_compilation_name = + absl::StrCat("oc_while_", while_node_name); // Step 1: add key placeholder node. TF_ASSIGN_OR_RETURN( @@ -1759,10 +1778,12 @@ TF_ATTRIBUTE_NOINLINE absl::Status BuildHostGraphForWhileNode( // Builds host graph for func call nodes. 
absl::Status BuildHostGraphForFuncCallNode( - const string& xla_cluster_attr_name, const string& xla_cluster_name, - const string& outside_compilation_attr_name, - const string& func_call_node_name, const string& func_call_host_func_name, - const string& host_graph_func_name, FunctionLibraryDefinition* fld) { + const std::string& xla_cluster_attr_name, + const std::string& xla_cluster_name, + const std::string& outside_compilation_attr_name, + const std::string& func_call_node_name, + const std::string& func_call_host_func_name, + const std::string& host_graph_func_name, FunctionLibraryDefinition* fld) { Graph host_graph(fld); AttrValue device_ordinal_value; device_ordinal_value.set_placeholder("_device_ordinal"); @@ -1807,18 +1828,19 @@ absl::Status BuildHostGraphForFuncCallNode( } TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, Graph* g, Node* n, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, - std::vector* host_graphs, - std::vector* shape_inference_graphs, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { bool func_has_outside_compilation = false; NameAttrList func; if (fld->Contains(n->type_string())) { func.set_name(n->type_string()); - typedef protobuf::Map AttrMap; + typedef protobuf::Map AttrMap; *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end()); } else if (n->IsPartitionedCall()) { TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "f", &func)); @@ -1827,7 +1849,7 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( func.set_name(FunctionLibraryDefinition::kGradientOp); *func.mutable_attr() = n->def().attr(); } - string canonical_func_name; + std::string canonical_func_name; if (func.name() == FunctionLibraryDefinition::kGradientOp) { NameAttrList forward_func; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "f", &forward_func)); @@ -1835,8 +1857,8 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( } else { canonical_func_name = func.name(); } - string new_func_name = absl::StrCat(canonical_func_name, "_oc"); - string host_func_name = + std::string new_func_name = absl::StrCat(canonical_func_name, "_oc"); + std::string host_func_name = absl::StrCat("oc_func_call_host_", canonical_func_name); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, @@ -1876,11 +1898,11 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( TF_RETURN_IF_ERROR(replace_builder->Finalize(replace_def.get())); TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, *replace_def)); replace->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); replace->AddAttr(kXlaOriginalOutsideCompilationNodeName, replace->name()); // Build host side graph for the function call. 
- string oc_host_graph_name = + std::string oc_host_graph_name = absl::StrCat("oc_func_host_graph_", replace->name()); TF_RETURN_IF_ERROR(BuildHostGraphForFuncCallNode( xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name, @@ -1893,12 +1915,13 @@ TF_ATTRIBUTE_NOINLINE absl::Status ExtractOutsideCompilationForFuncCallNode( } absl::Status ExtractOutsideCompilationForIfNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, Graph* g, Node* n, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, - std::vector* host_graphs, - std::vector* shape_inference_graphs, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { // Instantiate "then_branch" and "else_branch". NameAttrList then_branch, else_branch; @@ -1908,12 +1931,14 @@ absl::Status ExtractOutsideCompilationForIfNode( // Extract outside compilation for then_branch and else_branch. bool then_branch_has_outside_compilation = false; bool else_branch_has_outside_compilation = false; - string then_branch_host_func_name = - absl::StrCat("oc_then_branch_host_if_", then_branch.name()), - else_branch_host_func_name = - absl::StrCat("oc_else_branch_host_if_", else_branch.name()); - string then_branch_xla_func_name = absl::StrCat(then_branch.name(), "_oc"), - else_branch_xla_func_name = absl::StrCat(else_branch.name(), "_oc"); + std::string then_branch_host_func_name = + absl::StrCat("oc_then_branch_host_if_", then_branch.name()), + else_branch_host_func_name = + absl::StrCat("oc_else_branch_host_if_", else_branch.name()); + std::string then_branch_xla_func_name = + absl::StrCat(then_branch.name(), "_oc"), + else_branch_xla_func_name = + absl::StrCat(else_branch.name(), "_oc"); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, then_branch, then_branch_xla_func_name, then_branch_host_func_name, @@ -1946,7 +1971,7 @@ absl::Status ExtractOutsideCompilationForIfNode( } n->AddAttr(kXlaOriginalOutsideCompilationNodeName, n->name()); - string host_transfer_key = absl::StrCat("oc_if_pred_", n->name()); + std::string host_transfer_key = absl::StrCat("oc_if_pred_", n->name()); // XLA computation: add a SendToHost node to send cond predicate. Node* pred_node; @@ -1956,7 +1981,7 @@ absl::Status ExtractOutsideCompilationForIfNode( BuildSendIfPredNode(absl::StrCat("send_oc_if_pred_", n->name()), host_transfer_key, pred_node, g)); n->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{send_pred_node->name()}); + std::vector{send_pred_node->name()}); // Add a control edge from `send_pred_node` to If node, so XlaCompiler will // visit If node after `send_pred_node`, thus the token output for @@ -1969,7 +1994,7 @@ absl::Status ExtractOutsideCompilationForIfNode( // we need to create a no-op host graph. 
if (!then_branch_has_outside_compilation) { std::unique_ptr then_branch_host_graph(new Graph(fld)); - std::vector then_branch_host_graphs; + std::vector then_branch_host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph( xla_cluster_name, outside_compilation_attr_name, then_branch_host_graphs, fld, &then_branch_host_graph)); @@ -1986,7 +2011,7 @@ absl::Status ExtractOutsideCompilationForIfNode( } if (!else_branch_has_outside_compilation) { std::unique_ptr else_branch_host_graph(new Graph(fld)); - std::vector else_branch_host_graphs; + std::vector else_branch_host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph( xla_cluster_name, outside_compilation_attr_name, else_branch_host_graphs, fld, &else_branch_host_graph)); @@ -2001,7 +2026,7 @@ absl::Status ExtractOutsideCompilationForIfNode( TF_RETURN_IF_ERROR(fld->AddFunctionDef(else_branch_host_fdef)); } } - string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name()); + std::string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name()); TF_RETURN_IF_ERROR(BuildHostGraphForIfNode( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, n->name(), host_transfer_key, oc_host_graph_name, fld, @@ -2012,12 +2037,13 @@ absl::Status ExtractOutsideCompilationForIfNode( } absl::Status ExtractOutsideCompilationForWhileNode( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, Graph* g, Node* n, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, Graph* g, Node* n, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, - std::vector* host_graphs, - std::vector* shape_inference_graphs, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { // Instantiate "cond" and "body". NameAttrList cond, body; @@ -2027,10 +2053,12 @@ absl::Status ExtractOutsideCompilationForWhileNode( // Extract outside compilation for cond and body. bool cond_has_outside_compilation = false; bool body_has_outside_compilation = false; - string cond_host_func_name = absl::StrCat("oc_cond_host_while_", cond.name()), - body_host_func_name = absl::StrCat("oc_body_host_while_", body.name()); - string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"), - body_xla_func_name = absl::StrCat(body.name(), "_oc"); + std::string cond_host_func_name = + absl::StrCat("oc_cond_host_while_", cond.name()), + body_host_func_name = + absl::StrCat("oc_body_host_while_", body.name()); + std::string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"), + body_xla_func_name = absl::StrCat(body.name(), "_oc"); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr, @@ -2060,19 +2088,19 @@ absl::Status ExtractOutsideCompilationForWhileNode( } n->AddAttr(kXlaOriginalOutsideCompilationNodeName, n->name()); - string host_transfer_key = absl::StrCat("oc_while_pred_", n->name()); + std::string host_transfer_key = absl::StrCat("oc_while_pred_", n->name()); // XLA computation: rewrite cond function to add a SendToHost node to send // loop predicate. 
TF_RETURN_IF_ERROR(AddSendLoopPredToLoopCond( cond_xla_func_name, host_transfer_key, &cond, fld, n)); n->AddAttr(kXlaTokenInputNodesAttrName, - std::vector{kXlaTokenArgNodeName}); + std::vector{kXlaTokenArgNodeName}); // Build host side graph for the "While" node. if (!cond_has_outside_compilation) { std::unique_ptr cond_host_graph(new Graph(fld)); - std::vector host_graphs; + std::vector host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, host_graphs, fld, &cond_host_graph)); @@ -2088,7 +2116,7 @@ absl::Status ExtractOutsideCompilationForWhileNode( } if (!body_has_outside_compilation) { std::unique_ptr body_host_graph(new Graph(fld)); - std::vector host_graphs; + std::vector host_graphs; TF_RETURN_IF_ERROR(ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, host_graphs, fld, &body_host_graph)); @@ -2102,7 +2130,8 @@ absl::Status ExtractOutsideCompilationForWhileNode( TF_RETURN_IF_ERROR(fld->AddFunctionDef(body_host_fdef)); } } - string oc_host_graph_name = absl::StrCat("oc_while_host_graph_", n->name()); + std::string oc_host_graph_name = + absl::StrCat("oc_while_host_graph_", n->name()); TF_RETURN_IF_ERROR(BuildHostGraphForWhileNode( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, n->name(), host_transfer_key, oc_host_graph_name, fld, @@ -2113,11 +2142,13 @@ absl::Status ExtractOutsideCompilationForWhileNode( } absl::Status ExtractOutsideCompilationForNodesWithAssociatedFunctions( - Graph* g, const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const std::map& host_compute_core, FunctionLibraryRuntime* flr, - FunctionLibraryDefinition* fld, std::vector* host_graphs, - std::vector* shape_inference_graphs, + Graph* g, const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, + const std::map& host_compute_core, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* host_graphs, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { std::vector if_nodes, while_nodes, func_call_nodes; for (Node* n : g->nodes()) { @@ -2155,7 +2186,7 @@ absl::Status ExtractOutsideCompilationForNodesWithAssociatedFunctions( } absl::Status CopyOutsideCompilationConstNodes( - Graph* g, const string& outside_compilation_attr_name) { + Graph* g, const std::string& outside_compilation_attr_name) { for (Node* n : g->op_nodes()) { if (!n->IsConstant() || !HasNodeAttr(n->def(), outside_compilation_attr_name)) { @@ -2205,8 +2236,8 @@ absl::Status RewriteOutsideCompilationSubgraphFn::operator()( const std::vector& arg_source_tensors, std::unique_ptr* graph, std::vector* input_permutation, std::vector* output_permutation, NodeDef* node_def) { - string old_name = node_def->op(); - string new_name = + std::string old_name = node_def->op(); + std::string new_name = absl::StrCat(xla_cluster_name_, "_", new_function_name_, "_", old_name); node_def->set_op(new_name); node_def->set_name(new_name); @@ -2290,14 +2321,14 @@ absl::Status RewriteOutsideCompilationSubgraphFn::operator()( AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def); AddNodeAttr("shapes", *shapes, node_def); } else { - string shape_inference_func_name = + std::string shape_inference_func_name = absl::StrCat("_outside_compilation_shape_inference_", new_name); NameAttrList shape_inference_graph; shape_inference_graph.set_name(shape_inference_func_name); 
AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def); AddNodeAttr("shapes", std::vector{}, node_def); } - AddNodeAttr("ancestors", std::vector{}, node_def); + AddNodeAttr("ancestors", std::vector{}, node_def); AddNodeAttr("Tinputs", recv_at_host_dtypes, node_def); AddNodeAttr("Toutputs", send_from_host_dtypes, node_def); AddNodeAttr("key", absl::StrCat("host_compute_channel_", new_name), node_def); @@ -2306,15 +2337,16 @@ absl::Status RewriteOutsideCompilationSubgraphFn::operator()( } absl::Status ExtractOutsideCompilationForFunction( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const NameAttrList& func_name_attrs, const string& new_func_name, - const string& host_graph_func_name, - const std::map& host_compute_core, FunctionLibraryRuntime* flr, - FunctionLibraryDefinition* fld, std::vector* shape_inference_graphs, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const NameAttrList& func_name_attrs, + const std::string& new_func_name, const std::string& host_graph_func_name, + const std::map& host_compute_core, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* shape_inference_graphs, bool* has_outside_compilation) { // Convert the function to graph. - const string& func_name = func_name_attrs.name(); + const std::string& func_name = func_name_attrs.name(); FunctionLibraryRuntime::Handle handle; TF_RETURN_IF_ERROR( flr->Instantiate(func_name, AttrSlice(&func_name_attrs.attr()), &handle)); @@ -2345,8 +2377,8 @@ absl::Status ExtractOutsideCompilationForFunction( } std::unique_ptr graph_out; - std::vector outside_compilation_host_graphs; - std::vector shape_inference_graphs_to_rewrite; + std::vector outside_compilation_host_graphs; + std::vector shape_inference_graphs_to_rewrite; if (*has_outside_compilation) { // Copy outside compilation Const nodes with non outside compilation users. TF_RETURN_IF_ERROR(CopyOutsideCompilationConstNodes( @@ -2404,7 +2436,7 @@ absl::Status ExtractOutsideCompilationForFunction( } } } - std::map host_compute_nodes; + std::map host_compute_nodes; for (Node* n : outside_compilation_nodes) { auto host_compute_node_or = ReplaceOutsideCompilationCallNode( graph_out.get(), n, host_compute_core, *cluster_deps); @@ -2416,11 +2448,11 @@ absl::Status ExtractOutsideCompilationForFunction( // them so XlaCompiler can handle them in correct order. for (const auto& iter : host_compute_nodes) { Node* host_compute_node = iter.second; - std::vector token_input_node_names; + std::vector token_input_node_names; TF_RETURN_IF_ERROR(GetNodeAttr(host_compute_node->def(), kXlaTokenInputNodesAttrName, &token_input_node_names)); - for (const string& node_name : token_input_node_names) { + for (const std::string& node_name : token_input_node_names) { if (node_name == kXlaTokenArgNodeName) { continue; } @@ -2459,7 +2491,7 @@ absl::Status ExtractOutsideCompilationForFunction( // Shape inference graphs might contain Placeholder nodes for outside // compilation to outside compilation edges. Rewrite shape inference graphs // to remove such nodes. 
- for (const string& shape_inference_graph : + for (const std::string& shape_inference_graph : shape_inference_graphs_to_rewrite) { TF_RETURN_IF_ERROR( RewriteShapeInferenceGraph(shape_inference_graph, host_graph.get(), @@ -2467,7 +2499,7 @@ absl::Status ExtractOutsideCompilationForFunction( } // Remove the outside compilation graphs from function library. - for (const string& func : outside_compilation_host_graphs) { + for (const std::string& func : outside_compilation_host_graphs) { TF_RETURN_IF_ERROR(fld->RemoveFunction(func)); } @@ -2499,9 +2531,9 @@ absl::Status ExtractOutsideCompilationForFunction( } absl::Status ExtractOutsideCompilation( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const std::unordered_map& clusters, Graph* g, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::unordered_map& clusters, Graph* g, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, bool* modified) { if (VLOG_IS_ON(4)) { @@ -2511,14 +2543,14 @@ absl::Status ExtractOutsideCompilation( *modified = false; auto node_name_index = g->BuildNodeNameIndex(); for (auto& iter : clusters) { - string xla_cluster_name = iter.first; + std::string xla_cluster_name = iter.first; Node* n = iter.second.node; auto const& func_name_attrs = iter.second.func_name_attrs; auto const& host_compute_core = iter.second.host_compute_core; - std::vector shape_inference_graphs; + std::vector shape_inference_graphs; bool has_outside_compilation; - string host_graph_func_name = + std::string host_graph_func_name = absl::StrCat("oc_host_graph_", xla_cluster_name); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name, @@ -2528,7 +2560,7 @@ absl::Status ExtractOutsideCompilation( *modified |= has_outside_compilation; if (has_outside_compilation) { - string pivot_name = absl::StrCat(xla_cluster_name, "/pivot"); + std::string pivot_name = absl::StrCat(xla_cluster_name, "/pivot"); Node* pivot_node = node_name_index[pivot_name]; TF_RETURN_IF_ERROR(ExpandHostGraphIntoMainGraph( g, fld, host_graph_func_name, n, pivot_node)); diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h index 7631ccd0bc6ab0..c1697fcb4cde0d 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h @@ -44,9 +44,9 @@ namespace tensorflow { class RewriteOutsideCompilationSubgraphFn { public: RewriteOutsideCompilationSubgraphFn( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const string& xla_cluster_name, const string& new_function_name) + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const std::string& new_function_name) : xla_cluster_attr_name_(xla_cluster_attr_name), outside_compilation_attr_name_(outside_compilation_attr_name), xla_cluster_name_(xla_cluster_name), @@ -59,10 +59,10 @@ class RewriteOutsideCompilationSubgraphFn { NodeDef* node_def); private: - string xla_cluster_attr_name_; - string outside_compilation_attr_name_; - string xla_cluster_name_; - string new_function_name_; + std::string xla_cluster_attr_name_; + std::string outside_compilation_attr_name_; + std::string xla_cluster_name_; + std::string new_function_name_; }; // For an XLA computation function, replace all outside compilations with 
@@ -88,12 +88,13 @@ class RewriteOutsideCompilationSubgraphFn { // has_outside_compilation: a bool indicating whether this function has any // outside compilation nodes. absl::Status ExtractOutsideCompilationForFunction( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, const string& xla_cluster_name, - const NameAttrList& func_name_attrs, const string& new_func_name, - const string& host_graph_func_name, - const std::map& host_compute_core, FunctionLibraryRuntime* flr, - FunctionLibraryDefinition* fld, std::vector* shape_inference_graphs, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const NameAttrList& func_name_attrs, + const std::string& new_func_name, const std::string& host_graph_func_name, + const std::map& host_compute_core, + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + std::vector* shape_inference_graphs, bool* has_outside_compilation); // Rewrites XLA computation in `clusters` to replace outside compilation nodes @@ -101,9 +102,9 @@ absl::Status ExtractOutsideCompilationForFunction( // of outside compilation outputs cannot be determined now, we will store shape // inference graph into `fld`. absl::Status ExtractOutsideCompilation( - const string& xla_cluster_attr_name, - const string& outside_compilation_attr_name, - const std::unordered_map& clusters, Graph* g, + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::unordered_map& clusters, Graph* g, FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, bool* modified); diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index 4d007d07504939..1a6441a80726a0 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -236,14 +236,14 @@ class ExtractOutsideCompilationForFunctionTest : public ::testing::Test { } absl::Status ExtractOutsideCompilationTest( - const string &xla_cluster_attr_name, - const string &outside_compilation_attr_name, - const string &xla_cluster_name, const NameAttrList &func_name_attrs, - const string &new_func_name, const string &host_graph_func_name, - const std::map &host_compute_core, - FunctionLibraryDefinition *fld, - std::vector *shape_inference_graphs, - bool *has_outside_compilation) { + const std::string& xla_cluster_attr_name, + const std::string& outside_compilation_attr_name, + const std::string& xla_cluster_name, const NameAttrList& func_name_attrs, + const std::string& new_func_name, const std::string& host_graph_func_name, + const std::map& host_compute_core, + FunctionLibraryDefinition* fld, + std::vector* shape_inference_graphs, + bool* has_outside_compilation) { OptimizerOptions opts; pflr_ = std::make_unique( device_mgr_.get(), Env::Default(), /*config=*/nullptr, @@ -288,9 +288,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -342,7 +342,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) { std::unique_ptr 
host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper( *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, &host_fbody)); @@ -406,9 +406,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -481,9 +481,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -498,7 +498,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, @@ -568,7 +568,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) { // _xla_token_input_nodes. Node *if_node = node_name_index["if"]; EXPECT_NE(if_node, nullptr); - std::vector token_inputs; + std::vector token_inputs; TF_CHECK_OK( GetNodeAttr(if_node->def(), "_xla_token_input_nodes", &token_inputs)); EXPECT_THAT(token_inputs, ::testing::ElementsAre("send_oc_if_pred_if")); @@ -631,9 +631,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -648,7 +648,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, @@ -767,9 +767,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef)); } - protobuf::Map attrs; - std::map host_compute_core; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -784,7 +784,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { std::unique_ptr host_fbody; AttrValue device_ordinal_temp_value; device_ordinal_temp_value.set_i(0); - protobuf::Map host_func_attrs; + protobuf::Map 
host_func_attrs; host_func_attrs["_device_ordinal"] = device_ordinal_temp_value; TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, @@ -873,9 +873,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -898,14 +898,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, EXPECT_NE(host_compute_1, nullptr); // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. - std::vector token_input_nodes; + std::vector token_input_nodes; TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), "_xla_token_input_nodes", &token_input_nodes)); - std::vector expected_token_input_nodes_0({"_xla_token_arg_node"}); + std::vector expected_token_input_nodes_0( + {"_xla_token_arg_node"}); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_0); token_input_nodes.clear(); - std::vector expected_token_input_nodes_1( + std::vector expected_token_input_nodes_1( {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), "_xla_token_input_nodes", &token_input_nodes)); @@ -955,9 +956,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, } FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); - protobuf::Map attrs; - std::map host_compute_core = {{"0", 1}, {"1", 0}}; - std::vector shape_inference_graphs; + protobuf::Map attrs; + std::map host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector shape_inference_graphs; bool has_outside_compilation; NameAttrList name_attrs; name_attrs.set_name("cluster"); @@ -980,14 +981,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, EXPECT_NE(host_compute_1, nullptr); // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. - std::vector token_input_nodes; + std::vector token_input_nodes; TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), "_xla_token_input_nodes", &token_input_nodes)); - std::vector expected_token_input_nodes_0({"_xla_token_arg_node"}); + std::vector expected_token_input_nodes_0( + {"_xla_token_arg_node"}); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_0); token_input_nodes.clear(); - std::vector expected_token_input_nodes_1( + std::vector expected_token_input_nodes_1( {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), "_xla_token_input_nodes", &token_input_nodes)); diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index 446df9cac70e2d..a0a0d45736f1e8 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -46,7 +46,7 @@ std::vector* jitrt_flag_list; std::vector* flag_list; absl::once_flag flags_init; -bool SetterForXlaAutoJitFlag(const string& value) { +bool SetterForXlaAutoJitFlag(const std::string& value) { int32_t opt_level; // We need to use the mark_for_compilation_flags directly here instead of // going via GetMarkForCompilationPassFlags() to avoid infinite recursion. 
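Note on the setter being patched above: SetterForXlaAutoJitFlag accepts either a bare integer or the single-gpu(<number>) form documented in flags.h below. The following is a minimal, self-contained sketch of that parsing contract using absl string helpers; ParseAutoJitFlag and its out-parameters are illustrative names, not the code in this patch.

#include <cstdint>
#include <string>

#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"

// Sketch: a bare integer ("2") sets both optimization levels; the
// "single-gpu(2)" form sets only the single-GPU level. Returns false for
// anything else, mirroring how the real setter rejects malformed values.
bool ParseAutoJitFlag(const std::string& value, int32_t* general,
                      int32_t* single_gpu) {
  int32_t opt_level;
  if (absl::SimpleAtoi(value, &opt_level)) {
    *general = opt_level;
    *single_gpu = opt_level;
    return true;
  }
  absl::string_view rest = value;
  if (absl::ConsumePrefix(&rest, "single-gpu(") &&
      absl::ConsumeSuffix(&rest, ")") && absl::SimpleAtoi(rest, &opt_level)) {
    *single_gpu = opt_level;
    return true;
  }
  return false;
}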
@@ -81,7 +81,7 @@ bool SetterForXlaAutoJitFlag(const string& value) {
   return true;
 }

-bool SetterForXlaCallModuleDisabledChecks(const string& value) {
+bool SetterForXlaCallModuleDisabledChecks(const std::string& value) {
   auto directives = absl::StrSplit(value, ',', absl::SkipEmpty());
   call_module_flags->disabled_checks.insert(directives.begin(),
                                             directives.end());
@@ -231,7 +231,7 @@ void AllocateAndParseFlags() {
   mark_for_compilation_flags->xla_auto_jit_flag.optimization_level_general = 0;
   mark_for_compilation_flags->tf_xla_min_cluster_size = 4;
   mark_for_compilation_flags->tf_xla_max_cluster_size =
-      std::numeric_limits<int32>::max();
+      std::numeric_limits<int32_t>::max();
   mark_for_compilation_flags->tf_xla_clustering_debug = false;
   mark_for_compilation_flags->tf_xla_cpu_global_jit = false;
   mark_for_compilation_flags->tf_xla_clustering_fuel =
@@ -463,7 +463,7 @@ void ResetFlags() {
 }  // namespace

-bool SetXlaAutoJitFlagFromFlagString(const string& value) {
+bool SetXlaAutoJitFlagFromFlagString(const std::string& value) {
   absl::call_once(flags_init, &AllocateAndParseFlags);
   return SetterForXlaAutoJitFlag(value);
 }
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index 3561551f363ac6..96154b892ae5b0 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -41,15 +41,15 @@ struct XlaAutoJitFlag {
   // `optimization_level_general` applies.
   //
   // Experimental.
-  int32 optimization_level_single_gpu;
-  int32 optimization_level_general;
+  int32_t optimization_level_single_gpu;
+  int32_t optimization_level_general;
 };

 // Sets the xla_auto_jit_flag based on the given flag string. Supported syntax
 // is:
 // <number>: sets general and single_gpu setting to the provided number.
 // single-gpu(<number>): sets the single_gpu setting to the provided number.
-bool SetXlaAutoJitFlagFromFlagString(const string& value);
+bool SetXlaAutoJitFlagFromFlagString(const std::string& value);

 // Flags associated with the XLA bridge's mark_for_compilation_pass module.
 struct MarkForCompilationPassFlags {
@@ -57,16 +57,16 @@ struct MarkForCompilationPassFlags {
   // Minimum number of operators in an XLA compilation. Ignored for operators
   // placed on an XLA device or operators explicitly marked for compilation.
-  int32 tf_xla_min_cluster_size;
+  int32_t tf_xla_min_cluster_size;

   // Maximum number of operators in an XLA compilation.
-  int32 tf_xla_max_cluster_size;
+  int32_t tf_xla_max_cluster_size;

   // If non-empty, limit XLA clustering to the following TF operations.
-  string tf_xla_ops_to_cluster;
+  std::string tf_xla_ops_to_cluster;

   // If non-empty, remove following operations from XLA clustering excludelist.
-  string tf_xla_cluster_exclude_ops;
+  std::string tf_xla_cluster_exclude_ops;

   // Dump graphs during XLA compilation.
   bool tf_xla_clustering_debug;
@@ -110,7 +110,7 @@ struct MarkForCompilationPassFlags {
   bool tf_xla_disable_strict_signature_checks;

   // Specifies the persistance cache prefix. Default is "xla_compile_cache"
-  string tf_xla_persistent_cache_prefix;
+  std::string tf_xla_persistent_cache_prefix;
 };

 // Flags associated with XLA Sparse Core.
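Note: the flags.h hunk above is representative of the whole patch, which spells out the standard types behind TensorFlow's legacy platform aliases, so no behavior changes. A minimal before/after sketch of the pattern; the alias definitions shown are approximately how tensorflow/core/platform/types.h has historically declared them and should be treated as an assumption here.

#include <cstdint>
#include <string>

// Legacy platform aliases (approximate):
//   using int32 = std::int32_t;
//   using uint64 = std::uint64_t;
//   using string = std::string;
//
// Before (with the aliases in scope):
//   int32 tf_xla_min_cluster_size;
//   string tf_xla_ops_to_cluster;
//
// After: the same members, spelled with the underlying standard types.
struct MarkForCompilationFlagsSketch {
  int32_t tf_xla_min_cluster_size = 4;  // default set in AllocateAndParseFlags
  std::string tf_xla_ops_to_cluster;    // empty means "no restriction"
};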
diff --git a/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc b/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc index 75bd1d7310a295..1b0239c3550970 100644 --- a/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc +++ b/tensorflow/compiler/jit/force_xla_constants_on_host_pass_test.cc @@ -95,7 +95,7 @@ TEST(ForceXlaConstantsOnHostPassTest, Simple) { if (CanCreateXlaKernel(node->def())) { EXPECT_FALSE(found); found = true; - std::vector hostmem_attr; + std::vector hostmem_attr; EXPECT_TRUE(TryGetNodeAttr(node->def(), "_input_hostmem", &hostmem_attr)); EXPECT_EQ(hostmem_attr.size(), 1); EXPECT_EQ(hostmem_attr[0], 1); diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index 8317d222928200..03a7d1081b8b53 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -93,7 +93,7 @@ std::vector IntTensorAsVector(const Tensor& t) { result.reserve(t.NumElements()); for (int i = 0; i < t.NumElements(); i++) { int64_t element = t.dtype() == DT_INT32 - ? static_cast(t.flat()(i)) + ? static_cast(t.flat()(i)) : t.flat()(i); result.push_back(element); } @@ -251,14 +251,14 @@ absl::Status ComputeSliceSize(const Scope& host_scope, absl::Status ConvertTensorFlowSliceToStaticShapedSlice( Graph* g, Node* slice, const SliceInputs& slice_inputs, absl::string_view cluster_name, Node** result) { - string host_name; + std::string host_name; TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName( slice->assigned_device_name(), &host_name)); absl::Status status; Scope main_scope = NewInternalScope(g, &status, /*refiner=*/nullptr) - .WithXlaCluster(string(cluster_name)) + .WithXlaCluster(std::string(cluster_name)) .NewSubScope(absl::StrCat(slice->name(), "/static_shaped_slice")); Scope host_scope = main_scope.WithAssignedDevice(host_name); @@ -286,7 +286,7 @@ absl::Status ConvertTensorFlowSliceToStaticShapedSlice( TF_RETURN_IF_ERROR(main_scope.status()); - std::vector compile_time_const_inputs; + std::vector compile_time_const_inputs; compile_time_const_inputs.push_back("size"); (*result)->AddAttr(kXlaCompileTimeConstantInputsAttr, compile_time_const_inputs); diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc index 411f761995483a..6a8523a7d4c893 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc @@ -66,7 +66,8 @@ class FakeDevice : public Device { Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; } - static std::unique_ptr Make(const string& name, const string& type) { + static std::unique_ptr Make(const std::string& name, + const std::string& type) { DeviceAttributes device_attributes; device_attributes.set_name(name); device_attributes.set_device_type(DeviceType(type).type()); @@ -100,7 +101,7 @@ absl::Status IncreaseDynamismForAutoJit(const Scope& s, // Scope::ToGraph seems to drop assigned devices, probably because it goes // through a GraphDef. So explicitly maintain the device assignment. 
- std::unordered_map assigned_device_names; + std::unordered_map assigned_device_names; for (Node* n : s.graph()->nodes()) { assigned_device_names[n->name()] = n->assigned_device_name(); } @@ -149,7 +150,7 @@ TEST(SliceToDynamicSliceRewriteTest, Basic) { Inputs(m_slice_size_0, Const(static_cast(500)), Const(zero_32)))); - std::vector compile_time_constant_inputs; + std::vector compile_time_constant_inputs; compile_time_constant_inputs.push_back("size"); auto m_dynamic_slice = NodeWith( Op("Slice"), AssignedDevice(kDeviceName), diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index c3a24f3e0f7163..340cdbe8032c63 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -151,7 +151,7 @@ class MarkForCompilationPassImpl { std::optional resource_op_device, std::optional resource_var_operation_node_id, std::optional deadness_predicate, - bool is_xla_compile_attr_true, std::optional xla_scope) + bool is_xla_compile_attr_true, std::optional xla_scope) : cycles_graph_node_id_(tf_graph_node_id), effective_cluster_size_(effective_cluster_size), has_functional_control_flow_(has_functional_control_flow), @@ -220,7 +220,7 @@ class MarkForCompilationPassImpl { // If not nullopt then the all nodes in the cluster either do not have the // XlaScope attribute set or have it set to the value returned. - const std::optional& xla_scope() const { return xla_scope_; } + const std::optional& xla_scope() const { return xla_scope_; } // Returns the TF graph node IDs for the resource variable operations in // this cluster. @@ -228,7 +228,7 @@ class MarkForCompilationPassImpl { return resource_var_operation_node_ids_; } - string DebugString(const Graph& graph) const { + std::string DebugString(const Graph& graph) const { Node* node = graph.FindNodeId(cycles_graph_node_id()); if (!node) { // This should never happen but we try to be resilient because this is a @@ -254,7 +254,7 @@ class MarkForCompilationPassImpl { std::optional resource_op_device_; std::optional deadness_predicate_; bool is_xla_compile_attr_true_; - std::optional xla_scope_; + std::optional xla_scope_; std::vector resource_var_operation_node_ids_; Cluster(const Cluster&) = delete; @@ -365,7 +365,7 @@ class MarkForCompilationPassImpl { std::optional resource_var_operation_node_id, std::optional deadness_predicate, bool is_xla_compile_attr_true, - std::optional xla_scope) { + std::optional xla_scope) { cluster_storage_.push_back(std::make_unique( cycles_graph_node_id, effective_cluster_size, has_functional_control_flow, device_set, resource_op_device, @@ -374,7 +374,7 @@ class MarkForCompilationPassImpl { return cluster_storage_.back().get(); } - std::optional GetXlaScope(Node* n); + std::optional GetXlaScope(Node* n); // Returns the cluster for node `n`. If two nodes, N1 and N2, are placed in // the same cluster by the clustering algorithm then this function will return @@ -417,7 +417,8 @@ class MarkForCompilationPassImpl { // Returns a string representing `cycles_graph_node_id`. If the node is // unclusterable (either it is a phatom "frame" node or is not a compilation // candidate) then set `*found_unclustered` to true. - string DebugStringForCyclesGraphNode(int node_id, bool* found_unclustered); + std::string DebugStringForCyclesGraphNode(int node_id, + bool* found_unclustered); // We could not contract the edge from `from` to `to`. 
Return a string // describing an alternate path from `from` to `to` (besides the direct edge @@ -429,7 +430,7 @@ class MarkForCompilationPassImpl { // contracted because of the path [P,Q,R]" where P, Q and R are all clusters // since in that case a natural question is why we could not form a {A, P, Q, // R, B} cluster. - string DescribePotentialCycle(int from, int to); + std::string DescribePotentialCycle(int from, int to); // Merge the clusters `cluster_from` and `cluster_to`. After this step the // larger combined cluster is represented by `cluster_from`, but can have @@ -459,8 +460,8 @@ class MarkForCompilationPassImpl { return true; } - string EdgeContractionFailureMsg(Cluster* from, Cluster* to, - absl::string_view reason) { + std::string EdgeContractionFailureMsg(Cluster* from, Cluster* to, + absl::string_view reason) { return absl::StrCat("Could not contract ", from->DebugString(*graph_), " -> ", to->DebugString(*graph_), " because ", reason, "."); @@ -468,7 +469,7 @@ class MarkForCompilationPassImpl { DebugOptions debug_options_; Graph* graph_; - uint64 graph_fingerprint_; + uint64_t graph_fingerprint_; FunctionLibraryDefinition* flib_def_; Env* env_; OptimizerOptions::GlobalJitLevel global_jit_level_; @@ -547,7 +548,7 @@ std::vector MarkForCompilationPassImpl::FindAlternatePathForDebugging( return path; } -string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode( +std::string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode( int cycles_graph_node_id, bool* found_unclustered) { Cluster* cluster = GetClusterForCyclesGraphNode(cycles_graph_node_id); if (cluster) { @@ -567,8 +568,9 @@ string MarkForCompilationPassImpl::DebugStringForCyclesGraphNode( return node->name(); } -string MarkForCompilationPassImpl::DescribePotentialCycle(int from, int to) { - std::vector path_str; +std::string MarkForCompilationPassImpl::DescribePotentialCycle(int from, + int to) { + std::vector path_str; bool found_unclustered = false; absl::c_transform(FindAlternatePathForDebugging(from, to), std::back_inserter(path_str), [&](int node_id) { @@ -701,7 +703,7 @@ absl::StatusOr MarkForCompilationPassImpl::ForEachEdgeInPostOrder( // Make a copy of the set of successors because we may modify the graph in // TryToContractEdge. - std::vector successors_copy = + std::vector successors_copy = cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id()); for (int to : successors_copy) { @@ -974,7 +976,7 @@ class ClusterSequenceNumberGenerator { sequence_numbers_.clear(); } - int64 GetNext(uint64 key) { + int64_t GetNext(uint64_t key) { mutex_lock lock(mu_); return sequence_numbers_[key]++; } @@ -987,13 +989,13 @@ class ClusterSequenceNumberGenerator { private: mutex mu_; - absl::flat_hash_map sequence_numbers_; + absl::flat_hash_map sequence_numbers_; }; // Get a monotonic sequence numbers for a graph identified by its `fingerprint`. // The sequence number is necessary to disambiguate clusters extracted from the // same graph and when duplicate graphs exist within the same process. -int64_t GetNextClusterSequenceNumber(uint64 fingerprint) { +int64_t GetNextClusterSequenceNumber(uint64_t fingerprint) { return ClusterSequenceNumberGenerator::Global().GetNext(fingerprint); } @@ -1002,7 +1004,7 @@ absl::Status MarkForCompilationPassImpl::CreateClusters() { clusters_created_ = true; // Names for each cluster. 
- std::unordered_map<int, string> cluster_names; + std::unordered_map<int, std::string> cluster_names; if (debug_options_.dump_graphs) { DumpGraphToFile("before_mark_for_compilation", *graph_, flib_def_); @@ -1030,7 +1032,7 @@ absl::Status MarkForCompilationPassImpl::CreateClusters() { if (cluster->effective_cluster_size() >= debug_options_.min_cluster_size || cluster->has_functional_control_flow() || cluster->is_xla_compile_attr_true()) { - string& name = cluster_names[cluster->cycles_graph_node_id()]; + std::string& name = cluster_names[cluster->cycles_graph_node_id()]; if (name.empty()) { if (!cluster_name_prefix_.empty()) { @@ -1099,7 +1101,7 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency( return false; } -std::optional<string> MarkForCompilationPassImpl::GetXlaScope(Node* node) { +std::optional<std::string> MarkForCompilationPassImpl::GetXlaScope(Node* node) { // Look for either _XlaScope or _XlaInternalScope on both nodes to guide // clustering. If both nodes have a scope and the scopes do not match, do // not cluster along this edge. If even one of the nodes lacks a scope @@ -1118,14 +1120,14 @@ std::optional<string> MarkForCompilationPassImpl::GetXlaScope(Node* node) { if (global_jit_level_ != OptimizerOptions::OFF) { // If global_jit_level_ is ON, respect only _XlaInternalScope. - const string& scope = + const std::string& scope = GetNodeAttrString(node->attrs(), kXlaInternalScopeAttr); if (!scope.empty()) { return scope; } } else { // If global_jit_level_ is OFF, respect only _XlaScope. - const string& scope = GetNodeAttrString(node->attrs(), kXlaScopeAttr); + const std::string& scope = GetNodeAttrString(node->attrs(), kXlaScopeAttr); if (!scope.empty()) { return scope; } @@ -1186,9 +1188,9 @@ absl::Status MarkForCompilationPassImpl::BuildInitialClusterSet() { deadness_analysis_->GetPredicateFor(node, Graph::kControlSlot)); } - const string& device_name_str = !node->assigned_device_name().empty() - ? node->assigned_device_name() - : node->requested_device(); + const std::string& device_name_str = !node->assigned_device_name().empty() + ?
node->assigned_device_name() + : node->requested_device(); TF_ASSIGN_OR_RETURN(DeviceId device, device_info_cache_.GetIdFor(device_name_str)); @@ -1258,16 +1260,17 @@ absl::StatusOr<bool> IsIdentityDrivingConstsInLoop(Node* node) { return true; } -absl::flat_hash_set<string> CreateClusterExcludeList() { MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); - absl::flat_hash_set<string> excludelist; +absl::flat_hash_set<std::string> CreateClusterExcludeList() { + absl::flat_hash_set<std::string> excludelist; for (auto s : absl::StrSplit(flags->tf_xla_cluster_exclude_ops, ',')) { if (!s.empty()) { - excludelist.insert(string(s)); + excludelist.insert(std::string(s)); } } if (VLOG_IS_ON(2) && !excludelist.empty()) { - std::vector<string> vexcludelist(excludelist.begin(), excludelist.end()); + std::vector<std::string> vexcludelist(excludelist.begin(), + excludelist.end()); absl::c_sort(vexcludelist); VLOG(2) << "XLA clustering will exclude following TF operations from auto " "clustering: " @@ -1276,11 +1279,11 @@ absl::flat_hash_set<string> CreateClusterExcludeList() { return excludelist; } -absl::flat_hash_set<string> GetOrCreateAllowlist() { - absl::flat_hash_map<string, std::vector<string>>* allowlist_table = +absl::flat_hash_set<std::string> GetOrCreateAllowlist() { + absl::flat_hash_map<std::string, std::vector<std::string>>* allowlist_table = tensorflow::GetAllowlistTable(); MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); - absl::flat_hash_set<string> allowlist; + absl::flat_hash_set<std::string> allowlist; for (auto s : absl::StrSplit(flags->tf_xla_ops_to_cluster, ',')) { if (s == "FUSIBLE") { @@ -1292,12 +1295,12 @@ absl::flat_hash_set<string> GetOrCreateAllowlist() { allowlist.insert(v.begin(), v.end()); } else if (!s.empty()) { // Should be a user provided TF operation. - allowlist.insert(string(s)); + allowlist.insert(std::string(s)); } } if (VLOG_IS_ON(2) && !allowlist.empty()) { - std::vector<string> vallowlist(allowlist.begin(), allowlist.end()); + std::vector<std::string> vallowlist(allowlist.begin(), allowlist.end()); absl::c_sort(vallowlist); VLOG(2) << "XLA clustering will only consider the following TF operations: " << absl::StrJoin(vallowlist, " "); @@ -1338,8 +1341,8 @@ absl::Status MarkForCompilationPassImpl::FindCompilationCandidates() { auto allowlist = GetOrCreateAllowlist(); - std::vector<string> vall_ops = XlaOpRegistry::GetAllRegisteredOps(); - absl::flat_hash_set<string> all_ops(vall_ops.begin(), vall_ops.end()); + std::vector<std::string> vall_ops = XlaOpRegistry::GetAllRegisteredOps(); + absl::flat_hash_set<std::string> all_ops(vall_ops.begin(), vall_ops.end()); // Check that the user-provided TF operations really exist.
for (const auto& s : allowlist) { if (!all_ops.contains(s)) { @@ -1674,7 +1677,7 @@ void MarkForCompilationPassImpl::DumpPostClusteringGraphs() { DumpGraphToFile("mark_for_compilation_annotated", new_graph, flib_def_); } -string RatioToString(int numerator, int denominator) { +std::string RatioToString(int numerator, int denominator) { return absl::StrFormat("%d / %d (%.2f%%)", numerator, denominator, (100.0 * numerator) / denominator); } @@ -1985,10 +1988,11 @@ absl::Status MarkForCompilationPass::RunForTest( return MarkForCompilation(options, debug_options); } -absl::flat_hash_map<string, std::vector<string>>* GetAllowlistTable() { +absl::flat_hash_map<std::string, std::vector<std::string>>* +GetAllowlistTable() { // Table format: category name: {list of TF operations in that category} - static absl::flat_hash_map<string, std::vector<string>>* result = - new absl::flat_hash_map<string, std::vector<string>>{ + static absl::flat_hash_map<std::string, std::vector<std::string>>* result = + new absl::flat_hash_map<std::string, std::vector<std::string>>{ // Unary {"PW", {"ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin", @@ -2056,8 +2060,8 @@ void ResetClusterSequenceNumber() { ClusterSequenceNumberGenerator::Global().Reset(); } -absl::flat_hash_set<string> GetKnownXLAAllowlistOp() { - absl::flat_hash_set<string> result{ +absl::flat_hash_set<std::string> GetKnownXLAAllowlistOp() { + absl::flat_hash_set<std::string> result{ "AdjustContrastv2", "AdjustHue", "AdjustSaturation", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index 558912f2eee2e0..d6a2814ed33982 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -47,7 +47,7 @@ class MarkForCompilationPass : public GraphOptimizationPass { friend class MarkForCompilationPassTestHelper; }; -absl::flat_hash_map<string, std::vector<string>>* GetAllowlistTable(); +absl::flat_hash_map<std::string, std::vector<std::string>>* GetAllowlistTable(); namespace testing { // DO NOT USE IN PRODUCTION. @@ -56,7 +56,7 @@ namespace testing { void ResetClusterSequenceNumber(); // Return a list of operations that we choose not to put into the allowlist.
-absl::flat_hash_set<string> GetKnownXLAAllowlistOp(); +absl::flat_hash_set<std::string> GetKnownXLAAllowlistOp(); } // namespace testing } // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 1a120791206369..1d4031a4ffc926 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -67,10 +67,10 @@ static bool Initialized = [] { REGISTER_OP("UncompilableNullary").Output("o: float"); REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); -std::unordered_map<string, string> GetClusters(const Graph& graph) { - std::unordered_map<string, string> ids; +std::unordered_map<std::string, std::string> GetClusters(const Graph& graph) { + std::unordered_map<std::string, std::string> ids; for (Node* node : graph.nodes()) { - string cluster; + std::string cluster; if (TryGetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster)) { CHECK(!cluster.empty()); ids[node->name()] = cluster; @@ -86,10 +86,10 @@ std::unordered_map<string, string> GetClusters(const Graph& graph) { return ids; } -std::set<string> GetClusterNames(const Graph& graph) { - std::set<string> names; +std::set<std::string> GetClusterNames(const Graph& graph) { + std::set<std::string> names; for (Node* node : graph.nodes()) { - string cluster; + std::string cluster; if (TryGetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster)) { CHECK(!cluster.empty()); names.insert(cluster); @@ -98,10 +98,10 @@ std::set<string> GetClusterNames(const Graph& graph) { return names; } -absl::flat_hash_map<string, std::vector<string>> GetClusterSets( - const Graph& g, std::vector<string>* cluster_names = nullptr) { +absl::flat_hash_map<std::string, std::vector<std::string>> GetClusterSets( + const Graph& g, std::vector<std::string>* cluster_names = nullptr) { CHECK(cluster_names == nullptr || cluster_names->empty()); - absl::flat_hash_map<string, std::vector<string>> cluster_sets; + absl::flat_hash_map<std::string, std::vector<std::string>> cluster_sets; for (const auto& p : GetClusters(g)) { cluster_sets[p.second].push_back(p.first); } @@ -357,7 +357,7 @@ TEST(XlaCompilationTest, CallXlaDeviceFuncWithResourceOp) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + std::string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; testing::FindNodeByName(graph.get(), "A") ->set_assigned_device_name(xla_cpu_device); testing::FindNodeByName(graph.get(), "tanh0") @@ -694,7 +694,7 @@ TEST(XlaCompilationTest, ClusterNodesWithMismatchingInputDeadness) { } namespace { -Node* MakeRead(const Scope& scope, const string& id, +Node* MakeRead(const Scope& scope, const std::string& id, Node** var_handle_op = nullptr) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); @@ -706,7 +706,7 @@ Node* MakeRead(const Scope& scope, const string& id, return read.node(); } -Node* MakeWrite(const Scope& scope, const string& id) { +Node* MakeWrite(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output value_to_write = @@ -716,7 +716,7 @@ Node* MakeWrite(const Scope& scope, const string& id) { return assign_op.operation.node(); } -Node* MakeNeutral(const Scope& scope, const string& id) { +Node* MakeNeutral(const Scope& scope, const std::string& id) { return ops::Const(scope.WithOpName("Const" + id), 42.0f).node(); } } // namespace @@ -733,11 +733,11 @@ TEST(XlaCompilationTest, ResourcesClusteringAllowed) { std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); TF_EXPECT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); -
absl::flat_hash_map<string, std::vector<string>> cluster_sets = + absl::flat_hash_map<std::string, std::vector<std::string>> cluster_sets = GetClusterSets(*graph); ASSERT_EQ(cluster_sets.size(), 1); - std::vector<string> expected_clustered_nodes = {"AssignmentW", "ReadR", - "ValueToAssignW"}; + std::vector<std::string> expected_clustered_nodes = {"AssignmentW", "ReadR", + "ValueToAssignW"}; ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes); } @@ -753,7 +753,7 @@ TEST(XlaCompilationTest, ResourcesClusteringDisallowed) { std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); TF_EXPECT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - absl::flat_hash_map<string, std::vector<string>> cluster_sets = + absl::flat_hash_map<std::string, std::vector<std::string>> cluster_sets = GetClusterSets(*graph); ASSERT_EQ(cluster_sets.size(), 0); } @@ -779,13 +779,13 @@ TEST(XlaCompilationTest, ChainOfOps) { TF_EXPECT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::vector<string> cluster_names; - absl::flat_hash_map<string, std::vector<string>> cluster_sets = + std::vector<std::string> cluster_names; + absl::flat_hash_map<std::string, std::vector<std::string>> cluster_sets = GetClusterSets(*graph, &cluster_names); ASSERT_EQ(cluster_sets.size(), 1); - std::vector<string> expected_clustered_nodes_a = { + std::vector<std::string> expected_clustered_nodes_a = { "AssignmentW1", "ConstN0", "ReadR0", "ValueToAssignW1"}; ASSERT_EQ(cluster_sets[cluster_names[0]], expected_clustered_nodes_a); } @@ -881,7 +881,7 @@ TEST(XlaCompilationTest, ConstOp) { { std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); Scope root = Scope::NewRootScope().ExitOnError(); - auto c = ops::Const(root.WithOpName("const"), string("string")); + auto c = ops::Const(root.WithOpName("const"), std::string("string")); c.node()->AddAttr(kXlaCompileAttr, true); TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); @@ -901,12 +901,12 @@ TEST(XlaCompilationTest, DontClusterIdentityWithRefInput) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; + std::string cluster_name = clusters.begin()->second; - std::unordered_map<string, string> expected_clusters( + std::unordered_map<std::string, std::string> expected_clusters( {{"negate", cluster_name}, {"add", cluster_name}}); EXPECT_EQ(clusters, expected_clusters); } @@ -924,12 +924,12 @@ TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; + std::string cluster_name = clusters.begin()->second; - std::unordered_map<string, string> expected_clusters( + std::unordered_map<std::string, std::string> expected_clusters( {{"negate", cluster_name}, {"identity", cluster_name}, {"add", cluster_name}}); @@ -956,7 +956,7 @@ TEST(XlaCompilationTest, ClusterControlTrigger) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); // TODO(b/118970344): ctrl_trigger_a has inputs with mismatching deadness so // it won't be clustered.
ctrl_trigger_b is okay to cluster but we don't @@ -982,7 +982,7 @@ TEST(XlaCompilationTest, RandomShape) { TF_ASSERT_OK(root.ToGraph(graph.get())); TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["shape"], ""); } @@ -1028,7 +1028,7 @@ TEST(XlaCompilationTest, RandomShapeWithFunc) { TF_ASSERT_OK( MarkForCompilationPassTestHelper::MarkForCompilation(&graph, fld.get())); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["fn_call"], ""); } @@ -1054,12 +1054,12 @@ TEST(XlaCompilationTest, RandomShapeOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_gpu_device)); + n->set_assigned_device_name(std::string(xla_gpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/shape_rng"], ""); EXPECT_EQ(clusters["test/reshape"], ""); } @@ -1087,12 +1087,12 @@ TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_gpu_device)); + n->set_assigned_device_name(std::string(xla_gpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/read"], ""); EXPECT_EQ(clusters["test/read"], clusters["test/reshape"]); } @@ -1133,15 +1133,15 @@ TEST(XlaCompilationTest, DontClusterMergingNodes) { for (Node* n : graph->nodes()) { if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { - n->set_assigned_device_name(string(xla_gpu_dev0)); + n->set_assigned_device_name(std::string(xla_gpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { - n->set_assigned_device_name(string(xla_gpu_dev1)); + n->set_assigned_device_name(std::string(xla_gpu_dev1)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); // Each of the MatMuls should be in a separate cluster. - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); EXPECT_NE(clusters["MatMulCombined_dev1"], clusters["MatMul0_dev0"]); EXPECT_NE(clusters["MatMulCombined_dev1"], clusters["MatMul1_dev1"]); @@ -1170,17 +1170,17 @@ TEST(XlaCompilationTest, DontClusterMergingNodesOnCPU) { for (Node* n : graph->nodes()) { if (absl::EndsWith(n->name(), /*suffix=*/"cpu")) { - n->set_assigned_device_name(string(xla_cpu_dev0)); + n->set_assigned_device_name(std::string(xla_cpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { - n->set_assigned_device_name(string(xla_gpu_dev0)); + n->set_assigned_device_name(std::string(xla_gpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { - n->set_assigned_device_name(string(xla_gpu_dev1)); + n->set_assigned_device_name(std::string(xla_gpu_dev1)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); // Each of the MatMuls should be in a separate cluster.
- std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul0_dev0"]); EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul1_dev1"]); @@ -1223,14 +1223,14 @@ TEST(XlaCompilationTest, NOT_DontClusterSpreadingNodes) { TF_ASSERT_OK(root.ToGraph(graph.get())); for (Node* n : graph->nodes()) { if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { - n->set_assigned_device_name(string(xla_gpu_dev0)); + n->set_assigned_device_name(std::string(xla_gpu_dev0)); } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { - n->set_assigned_device_name(string(xla_gpu_dev1)); + n->set_assigned_device_name(std::string(xla_gpu_dev1)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["A_dev0"], clusters["MatMulSource_dev0"]); EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); EXPECT_NE(clusters["MatMulSource_dev0"], clusters["MatMul1_dev1"]); @@ -1254,12 +1254,12 @@ TEST(XlaCompilationTest, ClusterStatefulRandomOpOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_cpu_device)); + n->set_assigned_device_name(std::string(xla_cpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/a"], ""); EXPECT_NE(clusters["test/b"], ""); EXPECT_NE(clusters["test/c"], ""); @@ -1277,7 +1277,7 @@ TEST(XlaCompilationTest, DontAutoClusterStatefulRandomOp) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/a"], ""); EXPECT_EQ(clusters["test/b"], ""); } @@ -1299,12 +1299,12 @@ TEST(XlaCompilationTest, ClusterDummyOpsOnXlaDevice) { for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { - n->set_assigned_device_name(string(xla_cpu_device)); + n->set_assigned_device_name(std::string(xla_cpu_device)); } } TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/check"], ""); EXPECT_NE(clusters["test/greaterequal"], ""); EXPECT_NE(clusters["test/assert"], ""); @@ -1324,7 +1324,7 @@ TEST(XlaCompilationTest, DontAutoClusterDummyOps) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/assert"], ""); EXPECT_EQ(clusters["test/check"], ""); } @@ -1345,7 +1345,7 @@ TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/tensor_list_reserve"], ""); } @@ -1373,7 +1373,7 @@ TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters =
GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/tensor_list_element_shape"], ""); } @@ -1391,7 +1391,7 @@ TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) { std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); - string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + std::string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; for (Node* n : graph->nodes()) { if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { n->set_assigned_device_name(xla_cpu_device); @@ -1400,7 +1400,7 @@ TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/tensor_list_reserve"], ""); } @@ -1427,7 +1427,7 @@ TEST(XlaCompilationTest, CreateCombinedCpuGpuClusters) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/x"], ""); @@ -1451,7 +1451,7 @@ TEST(XlaCompilationTest, DontCreateGpu0AndGpu1Clusters) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/x"], ""); EXPECT_EQ(clusters["test/y"], ""); @@ -1473,7 +1473,7 @@ TEST(XlaCompilationTest, DontCreateCombinedCpuUnknownClusters) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/x"], ""); EXPECT_EQ(clusters["test/y"], ""); @@ -1486,8 +1486,8 @@ TEST(XlaCompilationTest, ClusterResourceOpsWhenSafe) { Node* resource_read = MakeRead(root, "read", &var_handle); Output b = ops::Add(root.WithOpName("test/b"), Output(resource_read, 0), a); - string resource_read_name = resource_read->name(); - string var_handle_name = var_handle->name(); + std::string resource_read_name = resource_read->name(); + std::string var_handle_name = var_handle->name(); std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); @@ -1499,7 +1499,7 @@ TEST(XlaCompilationTest, ClusterResourceOpsWhenSafe) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/b"], ""); EXPECT_EQ(clusters["test/b"], clusters[resource_read_name]); @@ -1512,8 +1512,8 @@ TEST(XlaCompilationTest, DontClusterResourceOpsWhenUnsafe) { Node* resource_read = MakeRead(root, "read", &var_handle); Output b = ops::Add(root.WithOpName("test/b"), Output(resource_read, 0), a); - string resource_read_name = resource_read->name(); - string var_handle_name = var_handle->name(); + std::string resource_read_name = resource_read->name(); + std::string var_handle_name = var_handle->name(); std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); @@ -1525,7 +1525,7 @@ TEST(XlaCompilationTest, DontClusterResourceOpsWhenUnsafe) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters =
GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/b"], ""); EXPECT_EQ(clusters[resource_read_name], ""); @@ -1555,7 +1555,7 @@ TEST(XlaCompilationTest, DontClusterNodesWithScopedAllocatorAttr) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/z"], ""); } @@ -1580,7 +1580,7 @@ TEST(XlaCompilationTest, DontClusterNodesWithForwardFromAttr) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["test/z"], ""); } @@ -1610,7 +1610,7 @@ TEST(XlaCompilationTest, ClusterShapeConsumerWithProducer) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/y"], ""); EXPECT_EQ(clusters["test/x"], clusters["test/y"]); @@ -1632,7 +1632,7 @@ TEST(XlaCompilationTest, ClusterShapeConsumerWithProducerAndConsumer) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["test/y"], ""); EXPECT_EQ(clusters["test/y"], clusters["test/x"]); @@ -1705,7 +1705,7 @@ TEST(XlaCompilationTest, IterationIncrementAndGroupDeps) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["some_ctrl_input"], ""); EXPECT_EQ(clusters["some_ctrl_input"], clusters["weights_0_update"]); @@ -1875,19 +1875,19 @@ TEST(XlaCompilationTest, ClusterSessionName) { TF_ASSERT_OK( MarkForCompilationPassTestHelper::MarkForCompilation(&graph, options)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; + std::string cluster_name = clusters.begin()->second; - std::unordered_map<string, string> expected_clusters( + std::unordered_map<std::string, std::string> expected_clusters( {{"negate", cluster_name}, {"add", cluster_name}}); EXPECT_EQ(clusters, expected_clusters); EXPECT_THAT(cluster_name, ::testing::StartsWith("test_session_name")); } namespace { -Node* MakeStageNode(GraphDefBuilder& builder, string name, +Node* MakeStageNode(GraphDefBuilder& builder, std::string name, std::initializer_list<DataType> dtypes, absl::Span<const ops::NodeOut> values) { auto opts = builder.opts() @@ -1949,7 +1949,7 @@ TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) { &graph, MarkForCompilationPassTestHelper::Options().WithNoClusterScoping())); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_EQ(clusters["add0"], clusters["add1"]); EXPECT_EQ(clusters["add0"], clusters["relu1"]); EXPECT_EQ(clusters["relu0"], clusters["add1"]); @@ -1964,7 +1964,7 @@ TEST(XlaCompilationTest, StagePipelinePreservedByClusterScopingPass) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); - std::unordered_map<string, string> clusters = GetClusters(*graph); + std::unordered_map<std::string, std::string> clusters = GetClusters(*graph); EXPECT_NE(clusters["add0"], clusters["add1"]); EXPECT_NE(clusters["add0"], clusters["relu1"]); EXPECT_NE(clusters["relu0"], clusters["add1"]); @@ -1973,9 +1973,9 @@ TEST(XlaCompilationTest,
StagePipelinePreservedByClusterScopingPass) { } TEST(XlaCompilationTest, XLALiteAllowlist) { auto* allowlist_table = tensorflow::GetAllowlistTable(); - absl::flat_hash_set<string> hallowlist; - std::vector<string> vall_ops = XlaOpRegistry::GetAllRegisteredOps(); - absl::flat_hash_set<string> all_ops(vall_ops.begin(), vall_ops.end()); + absl::flat_hash_set<std::string> hallowlist; + std::vector<std::string> vall_ops = XlaOpRegistry::GetAllRegisteredOps(); + absl::flat_hash_set<std::string> all_ops(vall_ops.begin(), vall_ops.end()); // Check that all the operations in the table are existing TF operations for (auto pair : *allowlist_table) { @@ -1988,10 +1988,10 @@ TEST(XlaCompilationTest, XLALiteAllowlist) { // Check that all registered XLA operations are in the allowlist // table or are known to not be in it. - absl::flat_hash_set<string> known_not_in_list = + absl::flat_hash_set<std::string> known_not_in_list = tensorflow::testing::GetKnownXLAAllowlistOp(); - std::vector<string> unknow_op; - for (string op : vall_ops) { + std::vector<std::string> unknow_op; + for (std::string op : vall_ops) { if (!hallowlist.contains(op) && !known_not_in_list.contains(op)) { unknow_op.push_back(op); } diff --git a/tensorflow/compiler/jit/node_matchers.cc b/tensorflow/compiler/jit/node_matchers.cc index ce1f2cd5bcd671..db158fc84a0173 100644 --- a/tensorflow/compiler/jit/node_matchers.cc +++ b/tensorflow/compiler/jit/node_matchers.cc @@ -35,7 +35,7 @@ namespace { using impl::NodeMatcherProperties; using impl::OutEdge; -string IndentAllButFirstLine(absl::string_view text) { +std::string IndentAllButFirstLine(absl::string_view text) { std::vector<string> lines = absl::StrSplit(text, '\n'); for (int i = 1; i < lines.size(); i++) { lines[i].insert(0, " "); @@ -86,21 +86,21 @@ bool MatchAndExplainTensor(const Tensor& tensor, const Tensor& expected_tensor, case DT_DOUBLE: return CompareTensor<double>(tensor, expected_tensor, listener); case DT_INT8: - return CompareTensor<int8>(tensor, expected_tensor, listener); + return CompareTensor<int8_t>(tensor, expected_tensor, listener); case DT_INT16: - return CompareTensor<int16>(tensor, expected_tensor, listener); + return CompareTensor<int16_t>(tensor, expected_tensor, listener); case DT_INT32: - return CompareTensor<int32>(tensor, expected_tensor, listener); + return CompareTensor<int32_t>(tensor, expected_tensor, listener); case DT_INT64: return CompareTensor<int64_t>(tensor, expected_tensor, listener); case DT_UINT8: - return CompareTensor<uint8>(tensor, expected_tensor, listener); + return CompareTensor<uint8_t>(tensor, expected_tensor, listener); case DT_UINT16: - return CompareTensor<uint16>(tensor, expected_tensor, listener); + return CompareTensor<uint16_t>(tensor, expected_tensor, listener); case DT_UINT32: - return CompareTensor<uint32>(tensor, expected_tensor, listener); + return CompareTensor<uint32_t>(tensor, expected_tensor, listener); case DT_UINT64: - return CompareTensor<uint64>(tensor, expected_tensor, listener); + return CompareTensor<uint64_t>(tensor, expected_tensor, listener); default: LOG(FATAL) << "Unsupported dtype " // Crash ok: testonly.
<< DataType_Name(tensor.dtype()); @@ -188,7 +188,7 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> { if (control_dep_set && !control_dep_set->MatchAndExplain(control_deps, &inner_listener)) { if (listener->IsInterested()) { - string explanation = inner_listener.str(); + std::string explanation = inner_listener.str(); if (!explanation.empty()) { explanation = absl::StrCat(", ", explanation, ","); } @@ -225,7 +225,7 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> { } void DescribeTo(::std::ostream* os) const override { - std::vector<string> predicates; + std::vector<std::string> predicates; if (name) { predicates.push_back(absl::StrCat("name: ", *name)); @@ -282,10 +282,11 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> { if (!attrs.empty()) { printed_something = true; - std::vector<string> attrs_str; + std::vector<std::string> attrs_str; absl::c_transform( attrs, std::back_inserter(attrs_str), - [](const std::pair<string, std::optional<AttrValue>>& attr_kv_pair) { + [](const std::pair<std::string, std::optional<AttrValue>>& + attr_kv_pair) { return absl::StrCat(attr_kv_pair.first, "->", attr_kv_pair.second ? SummarizeAttrValue(*attr_kv_pair.second) @@ -319,7 +320,7 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> { if (listener->IsInterested()) { *listener << "\ninput " << input_idx << " does not match expected:\n"; (*input_matchers)[input_idx].DescribeTo(listener->stream()); - string explanation = inner_listener.str(); + std::string explanation = inner_listener.str(); if (!explanation.empty()) { *listener << ", " << explanation; } @@ -327,14 +328,14 @@ return false; } - std::optional<string> op; - std::optional<string> name; - std::optional<string> assigned_device; + std::optional<std::string> op; + std::optional<std::string> name; + std::optional<std::string> assigned_device; std::optional<Tensor> constant_value; std::optional<std::vector<::testing::Matcher<OutEdge>>> input_matchers; std::optional<::testing::Matcher<absl::Span<const Node* const>>> control_dep_set; - std::map<string, std::optional<AttrValue>> attrs; + std::map<std::string, std::optional<AttrValue>> attrs; }; // Matches a dst and dst_output on an input edge. Today we only use this with @@ -352,7 +353,7 @@ class OutEdgeMatcher : public ::testing::MatcherInterface<OutEdge> { if (listener->IsInterested()) { *listener << "\nsource does not match expected "; src_matcher_.DescribeTo(listener->stream()); - string explanation = inner_listener.str(); + std::string explanation = inner_listener.str(); if (!explanation.empty()) { *listener << "\n\t" << explanation; } @@ -432,21 +433,21 @@ ::testing::Matcher<const Node*> impl::NodeWith( return ::testing::MakeMatcher(matcher); } -impl::NodeMatcherProperties Name(string name) { +impl::NodeMatcherProperties Name(std::string name) { impl::NodeMatcherProperties props; props.set_name(std::move(name)); return props; } // Matches a node with op `op`. -impl::NodeMatcherProperties Op(string op) { +impl::NodeMatcherProperties Op(std::string op) { impl::NodeMatcherProperties props; props.set_op(std::move(op)); return props; } // Matches a node with assigned device `assigned_device`.
-impl::NodeMatcherProperties AssignedDevice(string assigned_device) { +impl::NodeMatcherProperties AssignedDevice(std::string assigned_device) { impl::NodeMatcherProperties props; props.set_assigned_device(std::move(assigned_device)); return props; @@ -472,15 +473,15 @@ impl::NodeMatcherProperties impl::CtrlDeps( return props; } -std::pair<string, AttrValue> impl::AttrLiteralHelper( - const std::pair<string, bool>& bool_attr) { +std::pair<std::string, AttrValue> impl::AttrLiteralHelper( + const std::pair<std::string, bool>& bool_attr) { AttrValue attr_value; attr_value.set_b(bool_attr.second); return {bool_attr.first, attr_value}; } -std::pair<string, AttrValue> impl::AttrLiteralHelper( - const std::pair<string, absl::Span<const int>>& int_list_attr) { +std::pair<std::string, AttrValue> impl::AttrLiteralHelper( + const std::pair<std::string, absl::Span<const int>>& int_list_attr) { AttrValue attr_value; AttrValue::ListValue* list = attr_value.mutable_list(); for (int i : int_list_attr.second) { @@ -489,23 +490,24 @@ std::pair<string, AttrValue> impl::AttrLiteralHelper( return {int_list_attr.first, attr_value}; } -std::pair<string, AttrValue> impl::AttrLiteralHelper( - const std::pair<string, absl::Span<const string>>& string_list_attr) { +std::pair<std::string, AttrValue> impl::AttrLiteralHelper( + const std::pair<std::string, absl::Span<const std::string>>& + string_list_attr) { AttrValue attr_value; AttrValue::ListValue* list = attr_value.mutable_list(); - for (const string& s : string_list_attr.second) { + for (const std::string& s : string_list_attr.second) { list->add_s(s); } return {string_list_attr.first, attr_value}; } -impl::NodeMatcherProperties impl::Attr(std::pair<string, AttrValue> attr) { +impl::NodeMatcherProperties impl::Attr(std::pair<std::string, AttrValue> attr) { impl::NodeMatcherProperties props; props.set_attr(std::move(attr)); return props; } -impl::NodeMatcherProperties impl::Attr(string name) { +impl::NodeMatcherProperties impl::Attr(std::string name) { impl::NodeMatcherProperties props; props.set_attr({std::move(name), std::nullopt}); return props; diff --git a/tensorflow/compiler/jit/node_matchers.h b/tensorflow/compiler/jit/node_matchers.h index bb2c1875306185..1391df3743bd4c 100644 --- a/tensorflow/compiler/jit/node_matchers.h +++ b/tensorflow/compiler/jit/node_matchers.h @@ -84,11 +84,11 @@ class NodeMatcherProperties { public: using NodeSeqMatcher = std::vector<::testing::Matcher<const Node*>>; using InputSeqMatcher = std::vector<::testing::Matcher<OutEdge>>; - using AttrKeyValuePair = std::pair<string, std::optional<AttrValue>>; + using AttrKeyValuePair = std::pair<std::string, std::optional<AttrValue>>; - const std::optional<string>& name() const { return name_; } - const std::optional<string>& op() const { return op_; } - const std::optional<string>& assigned_device() const { + const std::optional<std::string>& name() const { return name_; } + const std::optional<std::string>& op() const { return op_; } + const std::optional<std::string>& assigned_device() const { return assigned_device_; } const std::optional<Tensor>& constant_value() const { @@ -102,17 +102,17 @@ class NodeMatcherProperties { } const std::optional<AttrKeyValuePair>& attr() const { return attr_; } - void set_name(string name) { + void set_name(std::string name) { DCHECK(IsEmpty()); name_ = std::move(name); } - void set_op(string op) { + void set_op(std::string op) { DCHECK(IsEmpty()); op_ = std::move(op); } - void set_assigned_device(string assigned_device) { + void set_assigned_device(std::string assigned_device) { DCHECK(IsEmpty()); assigned_device_ = std::move(assigned_device); } @@ -144,9 +144,9 @@ class NodeMatcherProperties { } private: - std::optional<string> name_; - std::optional<string> op_; - std::optional<string> assigned_device_; + std::optional<std::string> name_; + std::optional<std::string> op_; + std::optional<std::string> assigned_device_; std::optional<Tensor> constant_value_; std::optional<InputSeqMatcher> input_matchers_; std::optional<NodeSeqMatcher> control_deps_; @@ -162,39 +162,40 @@ impl::NodeMatcherProperties Inputs( impl::NodeMatcherProperties CtrlDeps( absl::Span<const ::testing::Matcher<const Node*>> control_deps);
-impl::NodeMatcherProperties Attr(std::pair<string, AttrValue> attrs); -impl::NodeMatcherProperties Attr(string name); +impl::NodeMatcherProperties Attr(std::pair<std::string, AttrValue> attrs); +impl::NodeMatcherProperties Attr(std::string name); -std::pair<string, AttrValue> AttrLiteralHelper( - const std::pair<string, bool>& bool_attr); +std::pair<std::string, AttrValue> AttrLiteralHelper( + const std::pair<std::string, bool>& bool_attr); -std::pair<string, AttrValue> AttrLiteralHelper( - const std::pair<string, absl::Span<const int>>& int_list_attr); +std::pair<std::string, AttrValue> AttrLiteralHelper( + const std::pair<std::string, absl::Span<const int>>& int_list_attr); -std::pair<string, AttrValue> AttrLiteralHelper( - const std::pair<string, absl::Span<const string>>& string_list_attr); +std::pair<std::string, AttrValue> AttrLiteralHelper( + const std::pair<std::string, absl::Span<const std::string>>& + string_list_attr); } // namespace impl // ----------------------------------------------------------------------------- // Public interface. // Matches a node with name `name`. -impl::NodeMatcherProperties Name(string name); +impl::NodeMatcherProperties Name(std::string name); // Matches a node with op `op`. -impl::NodeMatcherProperties Op(string op); +impl::NodeMatcherProperties Op(std::string op); // Matches a node with assigned device `assigned_device`. -impl::NodeMatcherProperties AssignedDevice(string assigned_device); +impl::NodeMatcherProperties AssignedDevice(std::string assigned_device); // Matches a node with a boolean typed attribute named `name` and with value // `value`. template <typename ValueTy> -impl::NodeMatcherProperties Attr(const string& name, ValueTy value) { +impl::NodeMatcherProperties Attr(const std::string& name, ValueTy value) { return impl::Attr({impl::AttrLiteralHelper({name, value})}); } -inline impl::NodeMatcherProperties Attr(const string& name) { +inline impl::NodeMatcherProperties Attr(const std::string& name) { return impl::Attr(name); } diff --git a/tensorflow/compiler/jit/node_matchers_test.cc b/tensorflow/compiler/jit/node_matchers_test.cc index 6f37d5617b6ce6..ac1d9ce3468df1 100644 --- a/tensorflow/compiler/jit/node_matchers_test.cc +++ b/tensorflow/compiler/jit/node_matchers_test.cc @@ -41,7 +41,7 @@ using testing::matchers::Op; using testing::matchers::Out; template <typename T, typename M> -string Explain(const T& t, const M& m) { +std::string Explain(const T& t, const M& m) { ::testing::StringMatchResultListener listener; EXPECT_THAT(t, ::testing::Not(m)); // For the error message.
EXPECT_FALSE(m.MatchAndExplain(t, &listener)); diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc index c8bbcee20e3829..9539a14d060f42 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -100,7 +100,7 @@ absl::Status PartiallyDecluster(std::unique_ptr<Graph>* graph) { return pass.Run(opt_options); } -Node* FindNodeByName(const Graph& graph, const string& name) { +Node* FindNodeByName(const Graph& graph, const std::string& name) { for (Node* node : graph.nodes()) { if (node->name() == name) { return node; @@ -109,7 +109,7 @@ Node* FindNodeByName(const Graph& graph, const string& name) { return nullptr; } -bool GetInputsForNode(const Graph& graph, const string& node_name, +bool GetInputsForNode(const Graph& graph, const std::string& node_name, std::vector<Node*>* inputs) { const Node* node = FindNodeByName(graph, node_name); if (node == nullptr) { @@ -292,7 +292,7 @@ TEST(PartiallyDeclusterPassTest, DeclusterDependentNodes) { void AddToCluster(absl::Span<Node* const> nodes, absl::string_view cluster_name) { for (Node* n : nodes) { - n->AddAttr(kXlaClusterAttr, string(cluster_name)); + n->AddAttr(kXlaClusterAttr, std::string(cluster_name)); } } diff --git a/tensorflow/compiler/jit/pjrt_base_device.cc b/tensorflow/compiler/jit/pjrt_base_device.cc index ce7ed954575040..d25d77d6cff22b 100644 --- a/tensorflow/compiler/jit/pjrt_base_device.cc +++ b/tensorflow/compiler/jit/pjrt_base_device.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { namespace { -DeviceAttributes BuildPjRtBaseDeviceAttributes(const string& name_prefix, - const string& device_name, +DeviceAttributes BuildPjRtBaseDeviceAttributes(const std::string& name_prefix, + const std::string& device_name, int device_ordinal) { return Device::BuildDeviceAttributes( absl::StrCat(name_prefix, "/device:", device_name, ":", device_ordinal), diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc index 2fee2b0b898890..33f09704d7c72b 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc @@ -143,7 +143,7 @@ bool IsEdgeSafe(XlaResourceOpKind from, XlaResourceOpKind to) { using ResourceOp = std::pair<int, XlaResourceOpKind>; -string ResourceOpToString(const ResourceOp& resource_op) { +std::string ResourceOpToString(const ResourceOp& resource_op) { return absl::StrCat( resource_op.first, ": ", XlaResourceOpInfo::XlaResourceOpKindToString(resource_op.second)); @@ -233,14 +233,14 @@ class ResourceOpSet { void operator=(const ResourceOpSet&) = delete; }; -string ResourceOpSetToString(const ResourceOpSet& resource_op_set) { - std::vector<string> elements_debug_string; +std::string ResourceOpSetToString(const ResourceOpSet& resource_op_set) { + std::vector<std::string> elements_debug_string; std::transform(resource_op_set.begin(), resource_op_set.end(), std::back_inserter(elements_debug_string), ResourceOpToString); return absl::StrCat("{", absl::StrJoin(elements_debug_string, ","), "}"); } -string NodeToString(const Node& n, XlaResourceOpKind resource_op_kind) { +std::string NodeToString(const Node& n, XlaResourceOpKind resource_op_kind) { return absl::StrCat( "[", n.name(), ": ", n.type_string(), "(", XlaResourceOpInfo::XlaResourceOpKindToString(resource_op_kind), ")", "]"); diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc
b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc index 8a80b8ae9b3497..6b038c992f1715 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc @@ -38,7 +38,7 @@ limitations under the License. namespace tensorflow { namespace { -Node* MakeRead(const Scope& scope, const string& id) { +Node* MakeRead(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output read = @@ -46,7 +46,7 @@ Node* MakeRead(const Scope& scope, const string& id) { return read.node(); } -Node* MakeWrite(const Scope& scope, const string& id) { +Node* MakeWrite(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output value_to_write = @@ -56,7 +56,7 @@ Node* MakeWrite(const Scope& scope, const string& id) { return assign_op.operation.node(); } -Node* MakeModify(const Scope& scope, const string& id) { +Node* MakeModify(const Scope& scope, const std::string& id) { Output var_handle = ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); Output value_to_write = ops::Const(scope.WithOpName("Increment" + id), 1.0f); @@ -65,7 +65,7 @@ Node* MakeModify(const Scope& scope, const string& id) { return assign_add_op.operation.node(); } -Node* MakeNeutral(const Scope& scope, const string& id) { +Node* MakeNeutral(const Scope& scope, const std::string& id) { return ops::Const(scope.WithOpName("Const" + id), 42.0f).node(); } @@ -238,7 +238,8 @@ TEST(ResourceOperationSafetyAnalysisTest, WriteReadModify) { EXPECT_EQ(incompatible_pairs[1], write_modify_pair); } -FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { +FunctionDefLibrary CreateFunctionDefLibWithConstFunction( + const std::string& name) { FunctionDefLibrary flib_def; FunctionDef func = FunctionDefHelper::Create( /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"}, @@ -249,8 +250,8 @@ FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { return flib_def; } -Node* MakeCall(Graph* graph, const string& callee_name, const string& node_name, - absl::Status* status) { +Node* MakeCall(Graph* graph, const std::string& callee_name, + const std::string& node_name, absl::Status* status) { NodeDef call_node; call_node.set_name(node_name); call_node.set_op(callee_name); diff --git a/tensorflow/compiler/jit/shape_inference.h b/tensorflow/compiler/jit/shape_inference.h index 467ecb83a74aae..b1469d2d699bf1 100644 --- a/tensorflow/compiler/jit/shape_inference.h +++ b/tensorflow/compiler/jit/shape_inference.h @@ -35,7 +35,8 @@ struct InferredShape { DataType handle_type = DT_INVALID; PartialTensorShape handle_shape; }; -typedef std::unordered_map<string, std::vector<InferredShape>> GraphShapeInfo; +typedef std::unordered_map<std::string, std::vector<InferredShape>> + GraphShapeInfo; // Infer shapes for all Tensors in a graph, and save them in a map. The vector // for a Node contains the information about each of its outputs.
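Side note on the GraphShapeInfo typedef retyped above: it maps each node name to one InferredShape per node output. A hedged consumption sketch (it assumes InferredShape's primary `shape` member alongside the handle fields shown; the helper name is invented for illustration and is not part of the patch):

    // Assumes TensorFlow's InferredShape, PartialTensorShape, and LOG macros.
    void LogInferredShapes(const GraphShapeInfo& shape_info) {
      for (const auto& [node_name, outputs] : shape_info) {
        for (size_t i = 0; i < outputs.size(); ++i) {
          // One InferredShape per node output, as the typedef implies.
          LOG(INFO) << node_name << ":" << i << " -> "
                    << outputs[i].shape.DebugString();
        }
      }
    }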
diff --git a/tensorflow/compiler/jit/shape_inference_test.cc b/tensorflow/compiler/jit/shape_inference_test.cc index eaabf18c79603c..599d442de4b092 100644 --- a/tensorflow/compiler/jit/shape_inference_test.cc +++ b/tensorflow/compiler/jit/shape_inference_test.cc @@ -61,7 +61,7 @@ TEST(ShapeInferenceTest, Basics) { TF_ASSERT_OK(InferShapes(graph.get(), /*arg_shapes=*/{}, /*fnlib_def=*/nullptr, &shape_info)); - std::map<string, std::vector<PartialTensorShape>> expected = { + std::map<std::string, std::vector<PartialTensorShape>> expected = { {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({3})}}, {"C", {PartialTensorShape()}}, {"D", {PartialTensorShape({2, 3})}}, {"E", {PartialTensorShape()}}, {"F", {PartialTensorShape()}}, @@ -94,7 +94,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSize) { TF_ASSERT_OK(InferShapes(graph.get(), arg_shapes, /*fnlib_def=*/nullptr, &shape_info)); - std::map<string, std::vector<PartialTensorShape>> expected = { + std::map<std::string, std::vector<PartialTensorShape>> expected = { {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({2, 3})}}, {"C", {PartialTensorShape({2, 3})}}, @@ -127,7 +127,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSizeIncompleteUserArgs) { TF_ASSERT_OK(InferShapes(graph.get(), arg_shapes, /*fnlib_def=*/nullptr, &shape_info)); - std::map<string, std::vector<PartialTensorShape>> expected = { + std::map<std::string, std::vector<PartialTensorShape>> expected = { {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({2, 3})}}, {"C", {PartialTensorShape({2, 3})}}, @@ -156,7 +156,7 @@ TEST(ShapeInferenceTest, WhileLoop) { ops::internal::Enter(scope.WithOpName("while/Enter2"), source, "aloop"); auto merge = ops::Merge(scope.WithOpName("while/Merge"), std::initializer_list<Input>{enter, dummy}); - auto ten = ops::Const<int32>( + auto ten = ops::Const<int32_t>( scope.WithOpName("while/Less/y").WithControlDependencies(merge.output), 10); auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten); @@ -168,11 +168,11 @@ TEST(ShapeInferenceTest, WhileLoop) { auto identity = ops::Identity(scope.WithOpName("while/Identity"), switch_node.output_true); auto identity_shape = - ops::Const<int32>(scope.WithOpName("while/Identity/shape"), {}); + ops::Const<int32_t>(scope.WithOpName("while/Identity/shape"), {}); auto identity_reshaped = ops::Reshape( scope.WithOpName("while/Identity/reshaped"), identity, identity_shape); - auto one = ops::Const<int32>( + auto one = ops::Const<int32_t>( scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); auto add = ops::Add(scope.WithOpName("while/add"), identity_reshaped, one); auto next_iteration = @@ -190,7 +190,7 @@ TEST(ShapeInferenceTest, WhileLoop) { GraphShapeInfo shape_info; TF_ASSERT_OK(InferShapes(&graph, /*arg_shapes=*/{}, /*fnlib_def=*/nullptr, &shape_info)); - std::map<string, std::vector<PartialTensorShape>> expected = { + std::map<std::string, std::vector<PartialTensorShape>> expected = { {"while/Identity", {PartialTensorShape()}}, {"while/add", {PartialTensorShape({})}}, }; diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc index 81ab1d8d05f96e..30a9ab51faf105 100644 --- a/tensorflow/compiler/jit/test_util.cc +++ b/tensorflow/compiler/jit/test_util.cc @@ -29,7 +29,7 @@ namespace tensorflow { absl::Status ShapeAnnotationsMatch( const Graph& graph, const GraphShapeInfo& shape_info, - std::map<string, std::vector<PartialTensorShape>> expected_shapes) { + std::map<std::string, std::vector<PartialTensorShape>> expected_shapes) { for (Node* node : graph.op_nodes()) { auto sit = shape_info.find(node->name()); TF_RET_CHECK(sit != shape_info.end()) @@ -50,7 +50,7 @@ absl::Status ShapeAnnotationsMatch( } } if (!expected_shapes.empty()) { - std::vector<string> missing; + std::vector<std::string> missing; missing.reserve(expected_shapes.size()); for (const auto& entry : expected_shapes) { missing.push_back(entry.first); @@ -88,12 +88,12 @@ void DeviceSetup::AddDevicesAndSetUp( flr_ =
pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); } -Device* DeviceSetup::GetDevice(const string& device_name) { +Device* DeviceSetup::GetDevice(const std::string& device_name) { if (device_mgr_ == nullptr) { return nullptr; } - string full_device_name = absl::StrCat( + std::string full_device_name = absl::StrCat( "/job:localhost/replica:0/task:0/device:", device_name, ":0"); Device* device; TF_CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device)); diff --git a/tensorflow/compiler/jit/test_util.h b/tensorflow/compiler/jit/test_util.h index ec694662297399..ba7d2533ef7c74 100644 --- a/tensorflow/compiler/jit/test_util.h +++ b/tensorflow/compiler/jit/test_util.h @@ -44,7 +44,7 @@ namespace tensorflow { // `expected_shapes` entries. absl::Status ShapeAnnotationsMatch( const Graph& graph, const GraphShapeInfo& shape_info, - std::map<string, std::vector<PartialTensorShape>> expected_shapes); + std::map<std::string, std::vector<PartialTensorShape>> expected_shapes); // A helper object to create GraphOptimizationPassOptions. struct GraphOptimizationPassWrapper { @@ -74,7 +74,7 @@ class DeviceSetup { void AddDevicesAndSetUp( const std::vector<string>& device_names, const std::optional<FunctionDef>& fdef = std::nullopt); - Device* GetDevice(const string& device_name); + Device* GetDevice(const std::string& device_name); FunctionLibraryRuntime* flr() { return flr_; } private: diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test.cc b/tensorflow/compiler/jit/tests/auto_clustering_test.cc index 90e73c23d210d7..d108bc51b5ee33 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test.cc @@ -23,7 +23,7 @@ class AutoClusteringTestImpl : public AutoClusteringTest { protected: // Test auto-clustering with a proto text file ${key}.pbtxt. absl::Status RunAutoClusteringTestWithPbtxt(absl::string_view key) { - string file_name_without_extension = + std::string file_name_without_extension = absl::StrCat(testing::TensorFlowSrcRoot(), "/compiler/jit/tests/", key); return AutoClusteringTest::RunAutoClusteringTestWithPbtxt( @@ -33,7 +33,7 @@ class AutoClusteringTestImpl : public AutoClusteringTest { // Test auto-clustering with a gzipped proto text file ${key}.pbtxt.gz. absl::Status RunAutoClusteringTestWithGzippedPbtxt(absl::string_view key) { - string file_name_without_extension = + std::string file_name_without_extension = absl::StrCat(testing::TensorFlowSrcRoot(), "/compiler/jit/tests/", key); return AutoClusteringTest::RunAutoClusteringTestWithGzippedPbtxt( diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc index dee77ac750ee54..258449e91120e1 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc @@ -33,7 +33,7 @@ limitations under the License.
namespace tensorflow { namespace { -absl::StatusOr<string> SummarizeClustering( +absl::StatusOr<std::string> SummarizeClustering( const GraphDef& auto_clustered_graph_def) { testing::ResetClusterSequenceNumber(); Graph graph(OpRegistry::Global()); @@ -45,7 +45,7 @@ absl::StatusOr<string> SummarizeClustering( // cluster_id -> (operation name -> # of operations) const int kNoCluster = -1; - std::map<int, std::map<string, int>> clusters; + std::map<int, std::map<std::string, int>> clusters; std::map<int, int> cluster_size; int clustered_nodes = 0; for (Node* n : graph.op_nodes()) { @@ -60,7 +60,7 @@ absl::StatusOr<string> SummarizeClustering( cluster_size[cluster]++; } - string result = + std::string result = absl::StrCat("Clustered nodes: ", clustered_nodes, "\nUnclustered nodes: ", cluster_size[kNoCluster], "\nNumber of clusters: ", clusters.size() - 1, "\n\n"); @@ -99,7 +99,7 @@ absl::Status AssertGraphDefIsUnclustered(const GraphDef& graphdef) { return absl::OkStatus(); } -absl::Status ReadTextProtoFromString(Env* env, const string& data, +absl::Status ReadTextProtoFromString(Env* env, const std::string& data, ::tensorflow::protobuf::Message* proto) { if (!::tensorflow::protobuf::TextFormat::ParseFromString(data, proto)) { return errors::DataLoss("Can't parse input data as text proto"); @@ -141,7 +141,8 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestImpl( graphdef = std::move(next); } - TF_ASSIGN_OR_RETURN(string clustering_summary, SummarizeClustering(graphdef)); + TF_ASSIGN_OR_RETURN(std::string clustering_summary, + SummarizeClustering(graphdef)); // To update golden files flip this to true and run // @@ -149,13 +150,15 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestImpl( // tensorflow/compiler/jit/tests:auto_clustering_test bool update_golden = false; if (update_golden) { - TF_RETURN_IF_ERROR(WriteStringToFile( - Env::Default(), string(golden_summary_file_path), clustering_summary)); + TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), + std::string(golden_summary_file_path), + clustering_summary)); } - string golden_file_contents; - TF_RETURN_IF_ERROR(ReadFileToString( - Env::Default(), string(golden_summary_file_path), &golden_file_contents)); + std::string golden_file_contents; + TF_RETURN_IF_ERROR(ReadFileToString(Env::Default(), + std::string(golden_summary_file_path), + &golden_file_contents)); EXPECT_EQ(golden_file_contents, clustering_summary); @@ -167,7 +170,7 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestWithPbtxt( absl::string_view golden_summary_file_path) { GraphDef graphdef; TF_RETURN_IF_ERROR( - ReadTextProto(Env::Default(), string(pbtxt_file_path), &graphdef)); + ReadTextProto(Env::Default(), std::string(pbtxt_file_path), &graphdef)); return RunAutoClusteringTestImpl(std::move(graphdef), golden_summary_file_path); } @@ -177,8 +180,8 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestWithGzippedPbtxt( absl::string_view golden_summary_file_path) { Env* env = Env::Default(); std::unique_ptr<RandomAccessFile> file_reader; - TF_RETURN_IF_ERROR( - env->NewRandomAccessFile(string(gzipped_pbtxt_file_path), &file_reader)); + TF_RETURN_IF_ERROR(env->NewRandomAccessFile( + std::string(gzipped_pbtxt_file_path), &file_reader)); std::unique_ptr<io::RandomAccessInputStream> input_stream( new io::RandomAccessInputStream(file_reader.get())); constexpr int k_buffer_size = 256 << 10; // 256kb @@ -206,7 +209,7 @@ absl::Status BenchmarkMarkForCompilation(absl::string_view graph_def_path, benchmark::State& state) { GraphDef graph_def; TF_RETURN_IF_ERROR( - ReadTextProto(Env::Default(), string(graph_def_path), &graph_def)); + ReadTextProto(Env::Default(), std::string(graph_def_path), &graph_def));
OptimizationPassRunner runner; TF_RETURN_IF_ERROR(runner.SetJitLevel(tensorflow::OptimizerOptions::ON_2)); diff --git a/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc b/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc index e4be1a1f641656..33e2daf941eafb 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc +++ b/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc @@ -29,7 +29,7 @@ namespace { // Creates a float tensor of linearly increasing values, starting from offset. Tensor CreateInputTensor(const TensorShape& shape, float offset) { Tensor tensor(DT_FLOAT, shape); - for (int64 i = 0; i < tensor.flat<float>().size(); ++i) { + for (int64_t i = 0; i < tensor.flat<float>().size(); ++i) { tensor.flat<float>()(i) = offset + i; } return tensor; @@ -127,7 +127,7 @@ absl::Status DeviceCompilerSerializeTest::ExecuteWithBatch( } Tensor f32_input(DT_FLOAT, shape); - for (int64 i = 0; i < f32_input.NumElements(); ++i) { + for (int64_t i = 0; i < f32_input.NumElements(); ++i) { EXPECT_NEAR(golden_output_tensors[0].flat<float>()(i), output_tensors[0].flat<float>()(i), 1e-3); } @@ -139,7 +139,7 @@ DeviceCompilerSerializeTest::AlterPersistentCacheEntryHloModuleNames( absl::string_view persistent_cache_dir_path, absl::string_view file_prefix) { Env* env = Env::Default(); - std::vector<string> file_names; + std::vector<std::string> file_names; TF_RETURN_IF_ERROR( env->GetChildren(tensorflow::testing::TmpDir(), &file_names)); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index ce0285c2e797d2..8ccb236897ce39 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -45,11 +45,11 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_resource.h" #include "xla/client/local_client.h" +#include "xla/future.h" #include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/pjrt_common.h" #include "xla/pjrt/pjrt_executable.h" -#include "xla/pjrt/pjrt_future.h" #include "xla/service/executable.h" #include "xla/service/maybe_owning_device_memory.h" #include "xla/service/shaped_buffer.h" @@ -809,8 +809,6 @@ xla::ExecuteOptions GetPjRtExecuteOptions( const DeviceType& device_type, absl::flat_hash_set<int> non_donatable_input_indices) { xla::ExecuteOptions options; - options.arguments_are_tupled = false; - options.untuple_result = true; // Hardcode run id to always be one: TF distributed strategy // differentiates between subsequent runs using dependency edges. This // is safe, as only TF dist-strat can produce distributed ops, and we diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc index 9e71286dc95df8..d8ed5feac79f12 100644 --- a/tensorflow/compiler/jit/xla_launch_util_test.cc +++ b/tensorflow/compiler/jit/xla_launch_util_test.cc @@ -207,8 +207,6 @@ class PjRtExecutionUtilTest : public OpsTestBase { &executable_args, /*owned_args=*/{}, &non_donatable_input_indices)); xla::ExecuteOptions exe_options; - exe_options.arguments_are_tupled = false; - exe_options.untuple_result = true; // TODO(b/257548614): currently PJRT is compiled as portable (num_replica = // 1 and num_partition = 1). Support multiple partitions case.
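Side note on CreateInputTensor above: it fills a float tensor with offset, offset+1, offset+2, ... so that batched outputs can later be compared elementwise with EXPECT_NEAR. The same pattern in a self-contained form (a plain std::vector instead of tensorflow::Tensor; names are illustrative, not from the patch):

    #include <cstdint>
    #include <vector>

    std::vector<float> MakeLinearBuffer(int64_t num_elements, float offset) {
      std::vector<float> buffer(static_cast<size_t>(num_elements));
      for (int64_t i = 0; i < num_elements; ++i) {
        // offset, offset + 1, offset + 2, ...
        buffer[static_cast<size_t>(i)] = offset + static_cast<float>(i);
      }
      return buffer;
    }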
@@ -520,8 +518,6 @@ TEST_F(PjRtExecutionUtilTest, PopulateCtxOutputsResourceUpdates) { TEST(XlaLaunchUtilTest, GetPjRtExecuteOptions) { xla::ExecuteOptions options = GetPjRtExecuteOptions(DeviceType(DEVICE_GPU), {}); - EXPECT_FALSE(options.arguments_are_tupled); - EXPECT_TRUE(options.untuple_result); EXPECT_FALSE(options.strict_shape_checking); EXPECT_TRUE(options.use_major_to_minor_data_layout_for_callbacks); } diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 7f200aa186a466..ab6c5abeca86f0 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -1990,7 +1990,6 @@ cc_library( ":tf_tfl_passes", "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", - "//tensorflow/compiler/mlir/lite/core:macros", "//tensorflow/compiler/mlir/lite/debug", "//tensorflow/compiler/mlir/lite/experimental/remat:metadata_util", "//tensorflow/compiler/mlir/lite/metrics:converter_error_data_proto_cc", @@ -2212,10 +2211,8 @@ tf_proto_library( srcs = ["converter_flags.proto"], make_default_target_header_only = True, protodeps = [ - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_options_proto", - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto", - "//tensorflow/compiler/mlir/lite/debug:debug_options_proto", ":types_proto", + "//tensorflow/compiler/mlir/lite/debug:debug_options_proto", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/compiler/mlir/lite/converter_flags.proto b/tensorflow/compiler/mlir/lite/converter_flags.proto index 1c1a1ad00aea74..49795ad8337d9a 100644 --- a/tensorflow/compiler/mlir/lite/converter_flags.proto +++ b/tensorflow/compiler/mlir/lite/converter_flags.proto @@ -17,8 +17,6 @@ package tflite; import "tensorflow/compiler/mlir/lite/debug/debug_options.proto"; import "tensorflow/compiler/mlir/lite/types.proto"; -import "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto"; -import "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.proto"; // Supported I/O file formats. Some formats may be input-only or output-only. enum FileFormat { @@ -43,6 +41,8 @@ enum FileFormat { // // Next ID to use: 69. message ConverterFlags { + reserved 54, 61; + // Input file format optional FileFormat input_format = 1; @@ -312,12 +312,6 @@ message ConverterFlags { // If true, disable folding mul->fc as in layer norm during optimize pass. optional bool disable_fuse_mul_and_fc = 53 [default = false]; - // Indicates the quantization specs. Quantization spec can be set to either - // a preset method or a custom method. - // Note: This is deprecated; use `quantization_config` instead. - optional stablehlo.quantization.QuantizationOptions quantization_options = 54 - [deprecated = true]; - // Flag to enable hlo to tf conversion. // This is useful to exercise StableHLO -> HLO -> TF -> TFLite path. optional bool enable_hlo_to_tf_conversion = 55 @@ -346,11 +340,6 @@ message ConverterFlags { // WARNING: Experimental interface, subject to change. optional string qdq_conversion_mode = 60 [default = "NONE"]; - // Configures quantization behavior. This config is fed to the StableHLO - // Quantizer integrated in the converter. - // WARNING: Experimental interface, subject to change. - optional stablehlo.quantization.QuantizationConfig quantization_config = 61; - // Disables per channel weights quantization for Dense layers and enables // legacy per tensor quantization. 
The legacy quantization for Dense layers is
  // inconsistent with Conv 1x1 which always performs per channel quantization.
diff --git a/tensorflow/compiler/mlir/lite/debug/debug_test.cc b/tensorflow/compiler/mlir/lite/debug/debug_test.cc
index 6c26865757950a..b82d5725182745 100644
--- a/tensorflow/compiler/mlir/lite/debug/debug_test.cc
+++ b/tensorflow/compiler/mlir/lite/debug/debug_test.cc
@@ -120,7 +120,7 @@ class InitPassManagerTest : public testing::Test {
   }
 
   absl::Status GetDumpDir(std::string* dump_dir) {
-    std::vector<string> files;
+    std::vector<std::string> files;
     if (auto status = tsl::Env::Default()->GetChildren(path_, &files);
         !status.ok()) {
       return status;
diff --git a/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc b/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc
index 0d83e1971072c3..80975abd3e9a7a 100644
--- a/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc
+++ b/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "mlir/Support/LLVM.h"
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "absl/strings/string_view.h"
 #include "llvm/Support/Casting.h"
 #include "mlir-c/IR.h"  // from @llvm-project
 #include "mlir/Bindings/Python/NanobindAdaptors.h"  // from @llvm-project  // IWYU pragma: keep
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
index 05abe12b6ebf58..d7027e91f480ef 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
@@ -4962,7 +4962,7 @@ void IfOp::getSuccessorRegions(RegionBranchPoint point,
                                SmallVectorImpl<RegionSuccessor>& regions) {
   // The `then` and the `else` region branch back to the parent operation.
   if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getResults()));
+    regions.push_back(RegionSuccessor(getOperation(), getResults()));
     return;
   }
@@ -5233,6 +5233,22 @@ int64_t SoftmaxOp::GetArithmeticCount(Operation* op) {
 // TanhOp
 //===----------------------------------------------------------------------===//
 
+OpFoldResult TanhOp::fold(FoldAdaptor adaptor) {
+  if (!ShouldFoldOperation(this->getOperation())) return {};
+
+  auto operands = adaptor.getOperands();
+  Type result_type = getType();
+  // Only constant fold for tensor of f32 is implemented.
+ if (!IsF32ShapedType(result_type)) return nullptr; + + auto compute = [](APFloat value) -> APFloat { + float f = value.convertToFloat(); + float result = std::tanh(f); + return APFloat(result); + }; + return ConstFoldUnaryOp(result_type, operands[0], compute); +} + int64_t TanhOp::GetArithmeticCount(Operation* op) { int64_t count; // As a very rough ballpark, the cost of evaluating a math function diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 64fc866b2be055..c90859cd6accfe 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1100,7 +1100,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$input, - TFL_TensorOf<[F32, QI4, QI8, QUI8, QI16]>:$filter, + TFL_TensorOf<[F32, QI2, QI4, QI8, QUI8, QI16]>:$filter, TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, TFL_AFAttr:$fused_activation_function, @@ -2477,13 +2477,13 @@ equivalent to setting: }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$input, + TFL_TensorOf<[F32, I32, I64, QI4, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$input, TFL_I32OrI64Tensor:$begin, TFL_I32OrI64Tensor:$size ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$output + TFL_TensorOf<[F32, I32, I64, QI4, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$output ); let hasVerifier = 1; @@ -3575,6 +3575,8 @@ def TFL_TanhOp: TFL_Op<"tanh", [ /*scale=*/1.0 / (1<<(bit_width-1)), /*zero_point=*/0); } }]; + + let hasFolder = 1; } def TFL_TileOp: TFL_Op<"tile", [ @@ -4279,7 +4281,7 @@ def TFL_DequantizeOp: TFL_Op<"dequantize", [NoMemoryEffect]> { quantization parameters. }]; - let arguments = (ins TFL_TensorOf<[QI4, QI8, QUI8, QI16, F16]>:$input); + let arguments = (ins TFL_TensorOf<[QI2, QI4, QI8, QUI8, QI16, F16]>:$input); let results = (outs TFL_FpTensor:$output); diff --git a/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc b/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc index a3ae7f73b24f24..b5a3319ba13362 100644 --- a/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc +++ b/tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape_test.cc @@ -19,9 +19,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/kernels/internal/runtime_shape.h" #include -#include #include -#include #include #include diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h index aa700dc166e046..29ed664e7ae78f 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h @@ -31,7 +31,7 @@ namespace tensorflow { // error status if it fails to convert the input. 
 absl::Status ConvertJaxToTFLiteFlatBuffer(
     const std::string& input, const tflite::ModelFlags& model_flags,
-    tflite::ConverterFlags& converter_flags, string* result);
+    tflite::ConverterFlags& converter_flags, std::string* result);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc
index fa94cd3b5b8120..c334f24442b491 100644
--- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc
+++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc
@@ -140,8 +140,8 @@ absl::Status ConvertSavedModelToTFLiteFlatBuffer(
   mlir::TFL::QuantizationSpecs quant_specs;
 
   // Parse input arrays.
-  std::vector<string> node_names;
-  std::vector<string> node_dtypes;
+  std::vector<std::string> node_names;
+  std::vector<std::string> node_dtypes;
   std::vector<std::optional<std::vector<int>>> node_shapes;
   std::vector<std::optional<double>> node_mins;
   std::vector<std::optional<double>> node_maxs;
@@ -210,8 +210,6 @@ absl::Status ConvertSavedModelToTFLiteFlatBuffer(
       converter_flags.convert_to_stablehlo();
   pass_config.legalize_custom_tensor_list_ops =
       converter_flags.legalize_custom_tensor_list_ops();
-  pass_config.enable_stablehlo_quantizer =
-      converter_flags.has_quantization_config();
   pass_config.enable_composite_direct_lowering =
       converter_flags.enable_composite_direct_lowering();
   pass_config.model_origin_framework = converter_flags.model_origin_framework();
diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h
index 33b9bacf2dfdeb..446652ccb8da05 100644
--- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h
+++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h
@@ -32,7 +32,7 @@ namespace tensorflow {
 // error status if it fails to convert the input.
 absl::Status ConvertSavedModelToTFLiteFlatBuffer(
     const tflite::ModelFlags& model_flags,
-    tflite::ConverterFlags& converter_flags, string* result,
+    tflite::ConverterFlags& converter_flags, std::string* result,
     const quantization::PyFunctionLibrary* quantization_py_function_lib);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h
index f837a6f0140e7b..de75080ab5da82 100644
--- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h
+++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h
@@ -46,8 +46,8 @@ absl::Status RegisterAllCustomOps(
 absl::Status PopulateQuantizationSpecs(
     const tflite::ModelFlags& model_flags,
     tflite::ConverterFlags& converter_flags,
-    mlir::TFL::QuantizationSpecs* quant_specs, std::vector<string>* node_names,
-    std::vector<string>* node_dtypes,
+    mlir::TFL::QuantizationSpecs* quant_specs,
+    std::vector<std::string>* node_names, std::vector<std::string>* node_dtypes,
     std::vector<std::optional<std::vector<int>>>* node_shapes,
     std::vector<std::optional<double>>* node_mins,
     std::vector<std::optional<double>>* node_maxs);
@@ -60,7 +60,8 @@ absl::Status ConvertMLIRToTFLiteFlatBuffer(
     std::unique_ptr<mlir::MLIRContext>&& context,
     mlir::OwningOpRef<mlir::ModuleOp> module,
     const mlir::TFL::PassConfig& pass_config,
-    const std::unordered_set<string>& saved_model_tags, string* result,
+    const std::unordered_set<std::string>& saved_model_tags,
+    std::string* result,
     const quantization::PyFunctionLibrary* quantization_py_function_lib);
 
 // Give a warning for any unused flags that have been specified.
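The three headers above all migrate their out-parameters from the legacy TF `string` alias to `std::string`. A hedged sketch of driving the retyped SavedModel entry point (the wrapper name below is a placeholder, not a real TensorFlow symbol; flag population is elided):

#include <string>

#include "absl/status/status.h"
#include "tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h"

// Placeholder wrapper for illustration: converts the SavedModel described by
// `model_flags` and writes the serialized .tflite bytes into an ordinary
// std::string, which plain C++ callers can now pass directly.
absl::Status ConvertOnce(const tflite::ModelFlags& model_flags,
                         tflite::ConverterFlags& converter_flags,
                         std::string* flatbuffer_out) {
  return tensorflow::ConvertSavedModelToTFLiteFlatBuffer(
      model_flags, converter_flags, flatbuffer_out,
      /*quantization_py_function_lib=*/nullptr);
}

Passing a null quantization library here is an assumption for the sketch; the real callers thread the Python function library through from the converter bindings.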
diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc index ae3b6233f8e959..1e1f79af16cbd6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc @@ -93,7 +93,7 @@ std::vector GetAsVector(const flatbuffers::Vector* vec) { class QuantizeWeightsTest : public testing::Test { protected: - QuantizeWeightsTest() {} + QuantizeWeightsTest() = default; void LoadBasicModel() { input_model_ = ReadTestModel(); diff --git a/tensorflow/compiler/mlir/lite/schema/schema.fbs b/tensorflow/compiler/mlir/lite/schema/schema.fbs index 01a214ab2c03bf..6cd1c51fb0cf9e 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema.fbs +++ b/tensorflow/compiler/mlir/lite/schema/schema.fbs @@ -24,6 +24,8 @@ // Version 3c: Move constant tensor buffers & custom op buffers outside from // Flatbuffers. Has backward compatibility with version 3, 3a and // 3b. +// Version 3d: Add ExternalBuffer tables and tensor.external_buffer field for +// referencing immutable data stored in external files. namespace tflite; @@ -263,6 +265,11 @@ table Tensor { // Currently only 1 subtype is supported. The field is defined as an array for // flexibility of supporting multiple subtypes in the future. variant_tensors:[VariantSubType]; + + // Optional reference to an ExternalBuffer entry that stores constant tensor + // data outside of the FlatBuffer. A value of 0 indicates that the tensor uses + // the traditional embedded buffer field instead. + external_buffer:uint; } // A list of builtin operators. Builtin operators are slightly faster than custom @@ -1613,6 +1620,22 @@ table Buffer { size: ulong; } +// Groups external buffers by file/URI. +table ExternalBufferGroup { + name:string; +} + +// Describes an immutable data slice stored in an external file. +table ExternalBuffer { + // Unique identifier for this external buffer. + id:uint; + // Index into the external_buffer_groups array. + group:uint; + offset:ulong; + length:ulong; + packing:string; +} + table Metadata { // A human readable string to uniquely identify a Metadata. name:string; @@ -1680,6 +1703,12 @@ table Model { // Optional SignatureDefs for the model. signature_defs:[SignatureDef]; + + // Optional groups for external weight buffers. + external_buffer_groups:[ExternalBufferGroup]; + + // Optional list of external weight buffers referenced by tensors. 
+ external_buffers:[ExternalBuffer]; } root_type Model; diff --git a/tensorflow/compiler/mlir/lite/schema/schema_generated.h b/tensorflow/compiler/mlir/lite/schema/schema_generated.h index b04076af12a074..2b1701a8b9c0b9 100755 --- a/tensorflow/compiler/mlir/lite/schema/schema_generated.h +++ b/tensorflow/compiler/mlir/lite/schema/schema_generated.h @@ -681,6 +681,14 @@ struct Buffer; struct BufferBuilder; struct BufferT; +struct ExternalBufferGroup; +struct ExternalBufferGroupBuilder; +struct ExternalBufferGroupT; + +struct ExternalBuffer; +struct ExternalBufferBuilder; +struct ExternalBufferT; + struct Metadata; struct MetadataBuilder; struct MetadataT; @@ -5952,6 +5960,7 @@ struct TensorT : public ::flatbuffers::NativeTable { std::vector shape_signature{}; bool has_rank = false; std::vector> variant_tensors{}; + uint32_t external_buffer = 0; TensorT() = default; TensorT(const TensorT &o); TensorT(TensorT&&) FLATBUFFERS_NOEXCEPT = default; @@ -5971,7 +5980,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VT_SPARSITY = 16, VT_SHAPE_SIGNATURE = 18, VT_HAS_RANK = 20, - VT_VARIANT_TENSORS = 22 + VT_VARIANT_TENSORS = 22, + VT_EXTERNAL_BUFFER = 24 }; const ::flatbuffers::Vector *shape() const { return GetPointer *>(VT_SHAPE); @@ -6003,6 +6013,9 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { const ::flatbuffers::Vector<::flatbuffers::Offset> *variant_tensors() const { return GetPointer> *>(VT_VARIANT_TENSORS); } + uint32_t external_buffer() const { + return GetField(VT_EXTERNAL_BUFFER, 0); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -6022,6 +6035,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VerifyOffset(verifier, VT_VARIANT_TENSORS) && verifier.VerifyVector(variant_tensors()) && verifier.VerifyVectorOfTables(variant_tensors()) && + VerifyField(verifier, VT_EXTERNAL_BUFFER, 4) && verifier.EndTable(); } TensorT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -6063,6 +6077,9 @@ struct TensorBuilder { void add_variant_tensors(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors) { fbb_.AddOffset(Tensor::VT_VARIANT_TENSORS, variant_tensors); } + void add_external_buffer(uint32_t external_buffer) { + fbb_.AddElement(Tensor::VT_EXTERNAL_BUFFER, external_buffer, 0); + } explicit TensorBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -6085,8 +6102,10 @@ inline ::flatbuffers::Offset CreateTensor( ::flatbuffers::Offset sparsity = 0, ::flatbuffers::Offset<::flatbuffers::Vector> shape_signature = 0, bool has_rank = false, - ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors = 0) { + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> variant_tensors = 0, + uint32_t external_buffer = 0) { TensorBuilder builder_(_fbb); + builder_.add_external_buffer(external_buffer); builder_.add_variant_tensors(variant_tensors); builder_.add_shape_signature(shape_signature); builder_.add_sparsity(sparsity); @@ -6111,7 +6130,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( ::flatbuffers::Offset sparsity = 0, const std::vector *shape_signature = nullptr, bool has_rank = false, - const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr) { + const std::vector<::flatbuffers::Offset> *variant_tensors = nullptr, + uint32_t external_buffer = 0) { auto shape__ = shape ? 
_fbb.CreateVector(*shape) : 0; auto name__ = name ? _fbb.CreateString(name) : 0; auto shape_signature__ = shape_signature ? _fbb.CreateVector(*shape_signature) : 0; @@ -6127,7 +6147,8 @@ inline ::flatbuffers::Offset CreateTensorDirect( sparsity, shape_signature__, has_rank, - variant_tensors__); + variant_tensors__, + external_buffer); } ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -16531,6 +16552,182 @@ inline ::flatbuffers::Offset CreateBufferDirect( ::flatbuffers::Offset CreateBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct ExternalBufferGroupT : public ::flatbuffers::NativeTable { + typedef ExternalBufferGroup TableType; + std::string name{}; +}; + +struct ExternalBufferGroup FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ExternalBufferGroupT NativeTableType; + typedef ExternalBufferGroupBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4 + }; + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + verifier.EndTable(); + } + ExternalBufferGroupT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ExternalBufferGroupT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ExternalBufferGroupBuilder { + typedef ExternalBufferGroup Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(ExternalBufferGroup::VT_NAME, name); + } + explicit ExternalBufferGroupBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateExternalBufferGroup( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0) { + ExternalBufferGroupBuilder builder_(_fbb); + builder_.add_name(name); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateExternalBufferGroupDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr) { + auto name__ = name ? 
_fbb.CreateString(name) : 0; + return tflite::CreateExternalBufferGroup( + _fbb, + name__); +} + +::flatbuffers::Offset CreateExternalBufferGroup(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct ExternalBufferT : public ::flatbuffers::NativeTable { + typedef ExternalBuffer TableType; + uint32_t id = 0; + uint32_t group = 0; + uint64_t offset = 0; + uint64_t length = 0; + std::string packing{}; +}; + +struct ExternalBuffer FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ExternalBufferT NativeTableType; + typedef ExternalBufferBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_ID = 4, + VT_GROUP = 6, + VT_OFFSET = 8, + VT_LENGTH = 10, + VT_PACKING = 12 + }; + uint32_t id() const { + return GetField(VT_ID, 0); + } + uint32_t group() const { + return GetField(VT_GROUP, 0); + } + uint64_t offset() const { + return GetField(VT_OFFSET, 0); + } + uint64_t length() const { + return GetField(VT_LENGTH, 0); + } + const ::flatbuffers::String *packing() const { + return GetPointer(VT_PACKING); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_ID, 4) && + VerifyField(verifier, VT_GROUP, 4) && + VerifyField(verifier, VT_OFFSET, 8) && + VerifyField(verifier, VT_LENGTH, 8) && + VerifyOffset(verifier, VT_PACKING) && + verifier.VerifyString(packing()) && + verifier.EndTable(); + } + ExternalBufferT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(ExternalBufferT *_o, const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; + static ::flatbuffers::Offset Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT* _o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct ExternalBufferBuilder { + typedef ExternalBuffer Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_id(uint32_t id) { + fbb_.AddElement(ExternalBuffer::VT_ID, id, 0); + } + void add_group(uint32_t group) { + fbb_.AddElement(ExternalBuffer::VT_GROUP, group, 0); + } + void add_offset(uint64_t offset) { + fbb_.AddElement(ExternalBuffer::VT_OFFSET, offset, 0); + } + void add_length(uint64_t length) { + fbb_.AddElement(ExternalBuffer::VT_LENGTH, length, 0); + } + void add_packing(::flatbuffers::Offset<::flatbuffers::String> packing) { + fbb_.AddOffset(ExternalBuffer::VT_PACKING, packing); + } + explicit ExternalBufferBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateExternalBuffer( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t id = 0, + uint32_t group = 0, + uint64_t offset = 0, + uint64_t length = 0, + ::flatbuffers::Offset<::flatbuffers::String> packing = 0) { + ExternalBufferBuilder builder_(_fbb); + builder_.add_length(length); + builder_.add_offset(offset); + builder_.add_packing(packing); + builder_.add_group(group); + builder_.add_id(id); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateExternalBufferDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + uint32_t id = 0, + uint32_t group = 0, + uint64_t offset = 0, + uint64_t length = 0, + const char *packing = nullptr) { + auto packing__ = packing ? 
_fbb.CreateString(packing) : 0; + return tflite::CreateExternalBuffer( + _fbb, + id, + group, + offset, + length, + packing__); +} + +::flatbuffers::Offset CreateExternalBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct MetadataT : public ::flatbuffers::NativeTable { typedef Metadata TableType; std::string name{}; @@ -16802,6 +16999,8 @@ struct ModelT : public ::flatbuffers::NativeTable { std::vector metadata_buffer{}; std::vector> metadata{}; std::vector> signature_defs{}; + std::vector> external_buffer_groups{}; + std::vector> external_buffers{}; ModelT() = default; ModelT(const ModelT &o); ModelT(ModelT&&) FLATBUFFERS_NOEXCEPT = default; @@ -16819,7 +17018,9 @@ struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VT_BUFFERS = 12, VT_METADATA_BUFFER = 14, VT_METADATA = 16, - VT_SIGNATURE_DEFS = 18 + VT_SIGNATURE_DEFS = 18, + VT_EXTERNAL_BUFFER_GROUPS = 20, + VT_EXTERNAL_BUFFERS = 22 }; uint32_t version() const { return GetField(VT_VERSION, 0); @@ -16845,6 +17046,12 @@ struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { const ::flatbuffers::Vector<::flatbuffers::Offset> *signature_defs() const { return GetPointer> *>(VT_SIGNATURE_DEFS); } + const ::flatbuffers::Vector<::flatbuffers::Offset> *external_buffer_groups() const { + return GetPointer> *>(VT_EXTERNAL_BUFFER_GROUPS); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *external_buffers() const { + return GetPointer> *>(VT_EXTERNAL_BUFFERS); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, VT_VERSION, 4) && @@ -16867,6 +17074,12 @@ struct Model FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VerifyOffset(verifier, VT_SIGNATURE_DEFS) && verifier.VerifyVector(signature_defs()) && verifier.VerifyVectorOfTables(signature_defs()) && + VerifyOffset(verifier, VT_EXTERNAL_BUFFER_GROUPS) && + verifier.VerifyVector(external_buffer_groups()) && + verifier.VerifyVectorOfTables(external_buffer_groups()) && + VerifyOffset(verifier, VT_EXTERNAL_BUFFERS) && + verifier.VerifyVector(external_buffers()) && + verifier.VerifyVectorOfTables(external_buffers()) && verifier.EndTable(); } ModelT *UnPack(const ::flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -16902,6 +17115,12 @@ struct ModelBuilder { void add_signature_defs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs) { fbb_.AddOffset(Model::VT_SIGNATURE_DEFS, signature_defs); } + void add_external_buffer_groups(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffer_groups) { + fbb_.AddOffset(Model::VT_EXTERNAL_BUFFER_GROUPS, external_buffer_groups); + } + void add_external_buffers(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffers) { + fbb_.AddOffset(Model::VT_EXTERNAL_BUFFERS, external_buffers); + } explicit ModelBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -16922,8 +17141,12 @@ inline ::flatbuffers::Offset CreateModel( ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> buffers = 0, ::flatbuffers::Offset<::flatbuffers::Vector> metadata_buffer = 0, ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> metadata = 0, - ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs = 0) { + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> signature_defs = 0, + 
::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffer_groups = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> external_buffers = 0) { ModelBuilder builder_(_fbb); + builder_.add_external_buffers(external_buffers); + builder_.add_external_buffer_groups(external_buffer_groups); builder_.add_signature_defs(signature_defs); builder_.add_metadata(metadata); builder_.add_metadata_buffer(metadata_buffer); @@ -16944,7 +17167,9 @@ inline ::flatbuffers::Offset CreateModelDirect( const std::vector<::flatbuffers::Offset> *buffers = nullptr, const std::vector *metadata_buffer = nullptr, const std::vector<::flatbuffers::Offset> *metadata = nullptr, - const std::vector<::flatbuffers::Offset> *signature_defs = nullptr) { + const std::vector<::flatbuffers::Offset> *signature_defs = nullptr, + const std::vector<::flatbuffers::Offset> *external_buffer_groups = nullptr, + const std::vector<::flatbuffers::Offset> *external_buffers = nullptr) { auto operator_codes__ = operator_codes ? _fbb.CreateVector<::flatbuffers::Offset>(*operator_codes) : 0; auto subgraphs__ = subgraphs ? _fbb.CreateVector<::flatbuffers::Offset>(*subgraphs) : 0; auto description__ = description ? _fbb.CreateString(description) : 0; @@ -16952,6 +17177,8 @@ inline ::flatbuffers::Offset CreateModelDirect( auto metadata_buffer__ = metadata_buffer ? _fbb.CreateVector(*metadata_buffer) : 0; auto metadata__ = metadata ? _fbb.CreateVector<::flatbuffers::Offset>(*metadata) : 0; auto signature_defs__ = signature_defs ? _fbb.CreateVector<::flatbuffers::Offset>(*signature_defs) : 0; + auto external_buffer_groups__ = external_buffer_groups ? _fbb.CreateVector<::flatbuffers::Offset>(*external_buffer_groups) : 0; + auto external_buffers__ = external_buffers ? 
_fbb.CreateVector<::flatbuffers::Offset>(*external_buffers) : 0; return tflite::CreateModel( _fbb, version, @@ -16961,7 +17188,9 @@ inline ::flatbuffers::Offset CreateModelDirect( buffers__, metadata_buffer__, metadata__, - signature_defs__); + signature_defs__, + external_buffer_groups__, + external_buffers__); } ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const ::flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -17215,7 +17444,7 @@ inline void SparsityParameters::UnPackTo(SparsityParametersT *_o, const ::flatbu (void)_resolver; { auto _e = traversal_order(); if (_e) { _o->traversal_order.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->traversal_order[_i] = _e->Get(_i); } } else { _o->traversal_order.resize(0); } } { auto _e = block_map(); if (_e) { _o->block_map.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->block_map[_i] = _e->Get(_i); } } else { _o->block_map.resize(0); } } - { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->dim_metadata[_i]) { _e->Get(_i)->UnPackTo(_o->dim_metadata[_i].get(), _resolver); } else { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->dim_metadata.resize(0); } } + { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->dim_metadata[_i]) { _e->Get(_i)->UnPackTo(_o->dim_metadata[_i].get(), _resolver); } else { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->dim_metadata.resize(0); } } } inline ::flatbuffers::Offset SparsityParameters::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { @@ -17277,7 +17506,8 @@ inline TensorT::TensorT(const TensorT &o) is_variable(o.is_variable), sparsity((o.sparsity) ? new tflite::SparsityParametersT(*o.sparsity) : nullptr), shape_signature(o.shape_signature), - has_rank(o.has_rank) { + has_rank(o.has_rank), + external_buffer(o.external_buffer) { variant_tensors.reserve(o.variant_tensors.size()); for (const auto &variant_tensors_ : o.variant_tensors) { variant_tensors.emplace_back((variant_tensors_) ? 
new tflite::VariantSubTypeT(*variant_tensors_) : nullptr); } } @@ -17293,6 +17523,7 @@ inline TensorT &TensorT::operator=(TensorT o) FLATBUFFERS_NOEXCEPT { std::swap(shape_signature, o.shape_signature); std::swap(has_rank, o.has_rank); std::swap(variant_tensors, o.variant_tensors); + std::swap(external_buffer, o.external_buffer); return *this; } @@ -17314,7 +17545,8 @@ inline void Tensor::UnPackTo(TensorT *_o, const ::flatbuffers::resolver_function { auto _e = sparsity(); if (_e) { if(_o->sparsity) { _e->UnPackTo(_o->sparsity.get(), _resolver); } else { _o->sparsity = std::unique_ptr(_e->UnPack(_resolver)); } } else if (_o->sparsity) { _o->sparsity.reset(); } } { auto _e = shape_signature(); if (_e) { _o->shape_signature.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape_signature[_i] = _e->Get(_i); } } else { _o->shape_signature.resize(0); } } { auto _e = has_rank(); _o->has_rank = _e; } - { auto _e = variant_tensors(); if (_e) { _o->variant_tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->variant_tensors[_i]) { _e->Get(_i)->UnPackTo(_o->variant_tensors[_i].get(), _resolver); } else { _o->variant_tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->variant_tensors.resize(0); } } + { auto _e = variant_tensors(); if (_e) { _o->variant_tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->variant_tensors[_i]) { _e->Get(_i)->UnPackTo(_o->variant_tensors[_i].get(), _resolver); } else { _o->variant_tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->variant_tensors.resize(0); } } + { auto _e = external_buffer(); _o->external_buffer = _e; } } inline ::flatbuffers::Offset Tensor::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { @@ -17335,6 +17567,7 @@ inline ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuild auto _shape_signature = _o->shape_signature.size() ? _fbb.CreateVector(_o->shape_signature) : 0; auto _has_rank = _o->has_rank; auto _variant_tensors = _o->variant_tensors.size() ? 
_fbb.CreateVector<::flatbuffers::Offset> (_o->variant_tensors.size(), [](size_t i, _VectorArgs *__va) { return CreateVariantSubType(*__va->__fbb, __va->__o->variant_tensors[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _external_buffer = _o->external_buffer; return tflite::CreateTensor( _fbb, _shape, @@ -17346,7 +17579,8 @@ inline ::flatbuffers::Offset CreateTensor(::flatbuffers::FlatBufferBuild _sparsity, _shape_signature, _has_rank, - _variant_tensors); + _variant_tensors, + _external_buffer); } inline StablehloGatherOptionsT *StablehloGatherOptions::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { @@ -21575,10 +21809,10 @@ inline SubGraphT *SubGraph::UnPack(const ::flatbuffers::resolver_function_t *_re inline void SubGraph::UnPackTo(SubGraphT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->tensors[_i]) { _e->Get(_i)->UnPackTo(_o->tensors[_i].get(), _resolver); } else { _o->tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->tensors.resize(0); } } + { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->tensors[_i]) { _e->Get(_i)->UnPackTo(_o->tensors[_i].get(), _resolver); } else { _o->tensors[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->tensors.resize(0); } } { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } else { _o->inputs.resize(0); } } { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } else { _o->outputs.resize(0); } } - { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operators[_i]) { _e->Get(_i)->UnPackTo(_o->operators[_i].get(), _resolver); } else { _o->operators[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->operators.resize(0); } } + { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operators[_i]) { _e->Get(_i)->UnPackTo(_o->operators[_i].get(), _resolver); } else { _o->operators[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->operators.resize(0); } } { auto _e = name(); if (_e) _o->name = _e->str(); } { auto _e = debug_metadata_index(); _o->debug_metadata_index = _e; } } @@ -21640,6 +21874,70 @@ inline ::flatbuffers::Offset CreateBuffer(::flatbuffers::FlatBufferBuild _size); } +inline ExternalBufferGroupT *ExternalBufferGroup::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ExternalBufferGroupT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ExternalBufferGroup::UnPackTo(ExternalBufferGroupT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = name(); if (_e) _o->name = _e->str(); } +} + +inline ::flatbuffers::Offset ExternalBufferGroup::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateExternalBufferGroup(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset 
CreateExternalBufferGroup(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferGroupT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ExternalBufferGroupT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); + return tflite::CreateExternalBufferGroup( + _fbb, + _name); +} + +inline ExternalBufferT *ExternalBuffer::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { + auto _o = std::unique_ptr(new ExternalBufferT()); + UnPackTo(_o.get(), _resolver); + return _o.release(); +} + +inline void ExternalBuffer::UnPackTo(ExternalBufferT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = id(); _o->id = _e; } + { auto _e = group(); _o->group = _e; } + { auto _e = offset(); _o->offset = _e; } + { auto _e = length(); _o->length = _e; } + { auto _e = packing(); if (_e) _o->packing = _e->str(); } +} + +inline ::flatbuffers::Offset ExternalBuffer::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { + return CreateExternalBuffer(_fbb, _o, _rehasher); +} + +inline ::flatbuffers::Offset CreateExternalBuffer(::flatbuffers::FlatBufferBuilder &_fbb, const ExternalBufferT *_o, const ::flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { ::flatbuffers::FlatBufferBuilder *__fbb; const ExternalBufferT* __o; const ::flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _id = _o->id; + auto _group = _o->group; + auto _offset = _o->offset; + auto _length = _o->length; + auto _packing = _o->packing.empty() ? 
0 : _fbb.CreateString(_o->packing); + return tflite::CreateExternalBuffer( + _fbb, + _id, + _group, + _offset, + _length, + _packing); +} + inline MetadataT *Metadata::UnPack(const ::flatbuffers::resolver_function_t *_resolver) const { auto _o = std::unique_ptr(new MetadataT()); UnPackTo(_o.get(), _resolver); @@ -21724,8 +22022,8 @@ inline SignatureDefT *SignatureDef::UnPack(const ::flatbuffers::resolver_functio inline void SignatureDef::UnPackTo(SignatureDefT *_o, const ::flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->inputs[_i]) { _e->Get(_i)->UnPackTo(_o->inputs[_i].get(), _resolver); } else { _o->inputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->inputs.resize(0); } } - { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->outputs[_i]) { _e->Get(_i)->UnPackTo(_o->outputs[_i].get(), _resolver); } else { _o->outputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->outputs.resize(0); } } + { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->inputs[_i]) { _e->Get(_i)->UnPackTo(_o->inputs[_i].get(), _resolver); } else { _o->inputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->inputs.resize(0); } } + { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->outputs[_i]) { _e->Get(_i)->UnPackTo(_o->outputs[_i].get(), _resolver); } else { _o->outputs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->outputs.resize(0); } } { auto _e = signature_key(); if (_e) _o->signature_key = _e->str(); } { auto _e = subgraph_index(); _o->subgraph_index = _e; } } @@ -21764,6 +22062,10 @@ inline ModelT::ModelT(const ModelT &o) for (const auto &metadata_ : o.metadata) { metadata.emplace_back((metadata_) ? new tflite::MetadataT(*metadata_) : nullptr); } signature_defs.reserve(o.signature_defs.size()); for (const auto &signature_defs_ : o.signature_defs) { signature_defs.emplace_back((signature_defs_) ? new tflite::SignatureDefT(*signature_defs_) : nullptr); } + external_buffer_groups.reserve(o.external_buffer_groups.size()); + for (const auto &external_buffer_groups_ : o.external_buffer_groups) { external_buffer_groups.emplace_back((external_buffer_groups_) ? new tflite::ExternalBufferGroupT(*external_buffer_groups_) : nullptr); } + external_buffers.reserve(o.external_buffers.size()); + for (const auto &external_buffers_ : o.external_buffers) { external_buffers.emplace_back((external_buffers_) ? 
new tflite::ExternalBufferT(*external_buffers_) : nullptr); } } inline ModelT &ModelT::operator=(ModelT o) FLATBUFFERS_NOEXCEPT { @@ -21775,6 +22077,8 @@ inline ModelT &ModelT::operator=(ModelT o) FLATBUFFERS_NOEXCEPT { std::swap(metadata_buffer, o.metadata_buffer); std::swap(metadata, o.metadata); std::swap(signature_defs, o.signature_defs); + std::swap(external_buffer_groups, o.external_buffer_groups); + std::swap(external_buffers, o.external_buffers); return *this; } @@ -21788,13 +22092,15 @@ inline void Model::UnPackTo(ModelT *_o, const ::flatbuffers::resolver_function_t (void)_o; (void)_resolver; { auto _e = version(); _o->version = _e; } - { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operator_codes[_i]) { _e->Get(_i)->UnPackTo(_o->operator_codes[_i].get(), _resolver); } else { _o->operator_codes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->operator_codes.resize(0); } } - { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->subgraphs[_i]) { _e->Get(_i)->UnPackTo(_o->subgraphs[_i].get(), _resolver); } else { _o->subgraphs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->subgraphs.resize(0); } } + { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->operator_codes[_i]) { _e->Get(_i)->UnPackTo(_o->operator_codes[_i].get(), _resolver); } else { _o->operator_codes[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->operator_codes.resize(0); } } + { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->subgraphs[_i]) { _e->Get(_i)->UnPackTo(_o->subgraphs[_i].get(), _resolver); } else { _o->subgraphs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->subgraphs.resize(0); } } { auto _e = description(); if (_e) _o->description = _e->str(); } - { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->buffers[_i]) { _e->Get(_i)->UnPackTo(_o->buffers[_i].get(), _resolver); } else { _o->buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->buffers.resize(0); } } + { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->buffers[_i]) { _e->Get(_i)->UnPackTo(_o->buffers[_i].get(), _resolver); } else { _o->buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->buffers.resize(0); } } { auto _e = metadata_buffer(); if (_e) { _o->metadata_buffer.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->metadata_buffer[_i] = _e->Get(_i); } } else { _o->metadata_buffer.resize(0); } } - { auto _e = metadata(); if (_e) { _o->metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->metadata[_i]) { _e->Get(_i)->UnPackTo(_o->metadata[_i].get(), _resolver); } else { _o->metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->metadata.resize(0); } } - { auto _e = signature_defs(); if (_e) { _o->signature_defs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->signature_defs[_i]) { 
_e->Get(_i)->UnPackTo(_o->signature_defs[_i].get(), _resolver); } else { _o->signature_defs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); }; } } else { _o->signature_defs.resize(0); } } + { auto _e = metadata(); if (_e) { _o->metadata.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->metadata[_i]) { _e->Get(_i)->UnPackTo(_o->metadata[_i].get(), _resolver); } else { _o->metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->metadata.resize(0); } } + { auto _e = signature_defs(); if (_e) { _o->signature_defs.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->signature_defs[_i]) { _e->Get(_i)->UnPackTo(_o->signature_defs[_i].get(), _resolver); } else { _o->signature_defs[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->signature_defs.resize(0); } } + { auto _e = external_buffer_groups(); if (_e) { _o->external_buffer_groups.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->external_buffer_groups[_i]) { _e->Get(_i)->UnPackTo(_o->external_buffer_groups[_i].get(), _resolver); } else { _o->external_buffer_groups[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->external_buffer_groups.resize(0); } } + { auto _e = external_buffers(); if (_e) { _o->external_buffers.resize(_e->size()); for (::flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { if(_o->external_buffers[_i]) { _e->Get(_i)->UnPackTo(_o->external_buffers[_i].get(), _resolver); } else { _o->external_buffers[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } } else { _o->external_buffers.resize(0); } } } inline ::flatbuffers::Offset Model::Pack(::flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const ::flatbuffers::rehasher_function_t *_rehasher) { @@ -21813,6 +22119,8 @@ inline ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder auto _metadata_buffer = _o->metadata_buffer.size() ? _fbb.CreateVector(_o->metadata_buffer) : 0; auto _metadata = _o->metadata.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateMetadata(*__va->__fbb, __va->__o->metadata[i].get(), __va->__rehasher); }, &_va ) : 0; auto _signature_defs = _o->signature_defs.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->signature_defs.size(), [](size_t i, _VectorArgs *__va) { return CreateSignatureDef(*__va->__fbb, __va->__o->signature_defs[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _external_buffer_groups = _o->external_buffer_groups.size() ? _fbb.CreateVector<::flatbuffers::Offset> (_o->external_buffer_groups.size(), [](size_t i, _VectorArgs *__va) { return CreateExternalBufferGroup(*__va->__fbb, __va->__o->external_buffer_groups[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _external_buffers = _o->external_buffers.size() ? 
_fbb.CreateVector<::flatbuffers::Offset> (_o->external_buffers.size(), [](size_t i, _VectorArgs *__va) { return CreateExternalBuffer(*__va->__fbb, __va->__o->external_buffers[i].get(), __va->__rehasher); }, &_va ) : 0; return tflite::CreateModel( _fbb, _version, @@ -21822,7 +22130,9 @@ inline ::flatbuffers::Offset CreateModel(::flatbuffers::FlatBufferBuilder _buffers, _metadata_buffer, _metadata, - _signature_defs); + _signature_defs, + _external_buffer_groups, + _external_buffers); } inline bool VerifyQuantizationDetails(::flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type) { diff --git a/tensorflow/compiler/mlir/lite/schema/schema_utils.cc b/tensorflow/compiler/mlir/lite/schema/schema_utils.cc index a173380940d600..cb61ce6243f3ad 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema_utils.cc +++ b/tensorflow/compiler/mlir/lite/schema/schema_utils.cc @@ -15,8 +15,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/schema/schema_utils.h" #include +#include +#include +#include #include "tensorflow/compiler/mlir/lite/kernels/internal/compatibility_macros.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" namespace tflite { @@ -59,4 +63,51 @@ BuiltinOperator GetBuiltinCode(const OperatorCodeT* op_code) { op_code->deprecated_builtin_code)); } +size_t TensorTypeGetSize(::tflite::TensorType data_type) { + switch (data_type) { + case ::tflite::TensorType_FLOAT32: + static_assert(sizeof(float) == 4, ""); + return 4; + case ::tflite::TensorType_FLOAT16: + static_assert(sizeof(int16_t) == 2, ""); + return 2; + case ::tflite::TensorType_INT32: + static_assert(sizeof(int32_t) == 4, ""); + return 4; + case ::tflite::TensorType_UINT8: + static_assert(sizeof(uint8_t) == 1, ""); + return 1; + case ::tflite::TensorType_INT64: + static_assert(sizeof(int64_t) == 8, ""); + return 8; + case ::tflite::TensorType_BOOL: + return sizeof(bool); + case ::tflite::TensorType_INT16: + static_assert(sizeof(int16_t) == 2, ""); + return 2; + case ::tflite::TensorType_COMPLEX64: + static_assert(sizeof(std::complex) == 8, ""); + return 8; + case ::tflite::TensorType_INT8: + static_assert(sizeof(int8_t) == 1, ""); + return 1; + case ::tflite::TensorType_FLOAT64: + static_assert(sizeof(double) == 8, ""); + return 8; + case ::tflite::TensorType_COMPLEX128: + static_assert(sizeof(std::complex) == 16, ""); + return 16; + case ::tflite::TensorType_UINT64: + static_assert(sizeof(uint64_t) == 8, ""); + return 8; + case ::tflite::TensorType_UINT32: + static_assert(sizeof(uint32_t) == 4, ""); + return 4; + case ::tflite::TensorType_UINT16: + static_assert(sizeof(uint16_t) == 2, ""); + return 2; + default: + return 0; + } +} } // namespace tflite diff --git a/tensorflow/compiler/mlir/lite/schema/schema_utils.h b/tensorflow/compiler/mlir/lite/schema/schema_utils.h index 7498aa02ebe5c2..9c32680b85117f 100644 --- a/tensorflow/compiler/mlir/lite/schema/schema_utils.h +++ b/tensorflow/compiler/mlir/lite/schema/schema_utils.h @@ -15,6 +15,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ +#include + #include "flatbuffers/flatbuffers.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" @@ -28,6 +30,11 @@ BuiltinOperator GetBuiltinCode(const OperatorCode *op_code); BuiltinOperator GetBuiltinCode(const OperatorCodeT *op_code); +// Returns the size of the given TensorType in bytes, or 0 if the TensorType is +// not supported, this function should be aligned with TfLiteTypeGetSize in +// lite/kernels/kernel_util.h. +size_t TensorTypeGetSize(::tflite::TensorType data_type); + } // namespace tflite #endif // TENSORFLOW_COMPILER_MLIR_LITE_SCHEMA_SCHEMA_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index 43fada7b0d0b62..cd553040786c72 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -539,6 +539,7 @@ cc_library( ":passes_inc_gen", ":unfold_splat_constant_pass", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:case", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:conv", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:custom_call", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:dot_general", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index ae672381bacafd..9a0a185443ebc0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -3073,6 +3073,13 @@ func.func @convert_iota_ui64() -> tensor<123xui64> { func.return %0 : tensor<123xui64> } +// CHECK-LABEL: func @no_convert_iota_ui8 +func.func @no_convert_iota_ui8() -> tensor<123xui8> { + // CHECK: "mhlo.iota" + %0 = "mhlo.iota"() <{ iota_dimension = 0 : i64 }> : () -> tensor<123xui8> + func.return %0 : tensor<123xui8> +} + // CHECK-LABEL: func @convert_avgpool_valid( // CHECK-SAME: %[[VAL_0:.*]]: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { // CHECK: %[[VAL_1:.*]] = "tf.AvgPool"(%[[VAL_0]]) <{data_format = "NHWC", ksize = [1, 3, 3, 1], padding = "VALID", strides = [1, 2, 2, 1]}> : (tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir index a77d02e78c1dce..1d8a63130ac1d9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir @@ -3721,14 +3721,43 @@ func.func @dynamic_broadcast_in_dim_general_case_expand_back_dims(%arg0: tensor< // CHECK: %2 = "tfl.broadcast_to"(%1, %arg1) : (tensor, tensor<4xi32>) -> tensor +// ----- + +//===----------------------------------------------------------------------===// +// mhlo.case +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: case_func +func.func @case_func(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { + %0 = "mhlo.case"(%arg0) ({ + %2 = mhlo.add %arg1, %arg2 : tensor + "mhlo.return"(%2) : (tensor) -> () + }, { + %2 = mhlo.multiply %arg1, %arg1 : tensor + "mhlo.return"(%2) : (tensor) -> () + }) : (tensor) -> tensor + func.return %0: tensor +} + +// CHECK: %[[CST:.*]] = 
arith.constant dense<0> : tensor +// CHECK: %[[PRED:.*]] = tfl.not_equal(%arg0, %[[CST]]) : (tensor, tensor) -> tensor +// CHECK: %[[IF:.*]] = "tfl.if"(%[[PRED]]) ({ +// CHECK: %[[MUL:.*]] = tfl.mul %arg1, %arg1 {fused_activation_function = "NONE"} : tensor +// CHECK: "tfl.yield"(%[[MUL]]) : (tensor) -> () +// CHECK: }, { +// CHECK: %[[ADD:.*]] = tfl.add %arg1, %arg2 {fused_activation_function = "NONE"} : tensor +// CHECK: "tfl.yield"(%[[ADD]]) : (tensor) -> () +// CHECK: }) : (tensor) -> tensor +// CHECK: return %[[IF]] : tensor + // ----- //===----------------------------------------------------------------------===// // mhlo.if //===----------------------------------------------------------------------===// -// CHECK-LABEL: if -func.func @if(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { +// CHECK-LABEL: if_label +func.func @if_label(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { %0 = mhlo.add %arg1, %arg2 : tensor %1 = "mhlo.if"(%arg0) ({ "mhlo.return"(%0) : (tensor) -> () diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index 3891d0f3fe4db3..7608ff985f1eb9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -2081,8 +2081,10 @@ class ConvertIotaOpToTfRange : public OpConversionPattern { ConversionPatternRewriter& rewriter) const final { RankedTensorType type = mlir::dyn_cast_or_null(iota_op.getType()); - // TF::RangeOp doesn't support UI16. - if (!type || type.getElementType().isUnsignedInteger(16)) return failure(); + // TF::RangeOp doesn't support UI16 and UI8. + if (!type || type.getElementType().isUnsignedInteger(16) || + type.getElementType().isUnsignedInteger(8)) + return failure(); const uint64_t dimension = iota_op.getIotaDimension(); Type element_type = type.getElementType(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD index 9e2f1cf33f495f..16c194df28f591 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD @@ -320,6 +320,21 @@ cc_library( ], ) +cc_library( + name = "case", + srcs = ["case.cc"], + hdrs = ["case.h"], + deps = [ + ":util", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@local_xla//xla/mlir_hlo", + ], +) + cc_library( name = "if", srcs = ["if.cc"], diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.cc new file mode 100644 index 00000000000000..b50a5e7fd83195 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.cc @@ -0,0 +1,100 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir::odml { +namespace { + +// Legalizes mhlo.case op to tfl.if op. +// This pattern only supports mhlo.case ops with exactly two branches. +class LegalizeCaseOp : public OpConversionPattern<mhlo::CaseOp> { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mhlo::CaseOp case_op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const final { + // mhlo.case can have N branches, but tfl.if only supports two. + if (case_op.getBranches().size() != 2) { + return rewriter.notifyMatchFailure( + case_op, "can only convert mhlo.case with 2 branches"); + } + + // `mhlo.case` takes an index, `tfl.if` takes a boolean predicate. + // For a 2-branch `mhlo.case` (branch 0 and branch 1), we need to map + // the index to a boolean. + // According to the mhlo.case spec, an out-of-bounds index defaults to the + // index of the last branch, which is 1 in this case. + // So, index 0 maps to branch 0, and any other index (1, or out of bounds) + // maps to branch 1. + // This can be expressed as a predicate `index != 0` for branch 1. + + auto loc = case_op->getLoc(); + auto index = case_op.getIndex(); + auto index_type = mlir::cast<ShapedType>(index.getType()); + + // Create a constant tensor of the same shape as the index, filled with + // zeros. + auto const_zero = arith::ConstantOp::create( + rewriter, loc, rewriter.getZeroAttr(index_type)); + + // Create the predicate `index != 0`. + auto pred_type = index_type.clone(rewriter.getI1Type()); + auto pred = mhlo::CompareOp::create( + rewriter, loc, pred_type, index, const_zero, + mhlo::ComparisonDirectionAttr::get(rewriter.getContext(), + mhlo::ComparisonDirection::NE), + mhlo::ComparisonTypeAttr{}); // Default comparison type is fine for + // integers. + + // Create the tfl.if op. + auto tfl_if = + TFL::IfOp::create(rewriter, loc, case_op.getResultTypes(), pred); + + // Branch 1 of mhlo.case becomes the `then_region` of tfl.if. + tfl_if.getThenRegion().takeBody(case_op.getBranches()[1]); + ReplaceTerminatorWithYield(tfl_if.getThenRegion(), rewriter); + + // Branch 0 of mhlo.case becomes the `else_region` of tfl.if.
+ tfl_if.getElseRegion().takeBody(case_op.getBranches()[0]); + ReplaceTerminatorWithYield(tfl_if.getElseRegion(), rewriter); + + rewriter.replaceOp(case_op, tfl_if.getResults()); + return success(); + } +}; + +} // namespace + +void PopulateCasePatterns(MLIRContext* context, RewritePatternSet& patterns, + ConversionTarget& target) { + patterns.add<LegalizeCaseOp>(context); + // Mark mhlo.case as dynamically legal: it's legal if it does NOT have + // exactly 2 branches, as those are the ones we want to convert. + target.addDynamicallyLegalOp<mhlo::CaseOp>( + [](mhlo::CaseOp op) { return op.getBranches().size() != 2; }); +} + +} // namespace mlir::odml diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h new file mode 100644 index 00000000000000..11c470a1492630 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h @@ -0,0 +1,31 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CASE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CASE_H_ + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project + +namespace mlir { +namespace odml { + +void PopulateCasePatterns(MLIRContext* context, RewritePatternSet& patterns, + ConversionTarget& target); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_CASE_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc index 9518b960f17442..0c43a5c4047a64 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc @@ -38,6 +38,7 @@ limitations under the License.
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/case.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.h" // IWYU pragma: keep @@ -479,6 +480,7 @@ void LegalizeHloToTfLitePass::runOnOperation() { PopulateWhilePatterns(context, patterns, target); PopulateGetDimensionSizePatterns(context, patterns, target); PopulateIfPatterns(context, patterns, target); + PopulateCasePatterns(context, patterns, target); PopulateLegalizeFftPatterns(context, patterns, target); PopulateCustomCallPatterns(context, patterns, target); @@ -493,7 +495,6 @@ void LegalizeHloToTfLitePass::runOnOperation() { } // namespace - // Creates an instance of the pass. std::unique_ptr> CreateLegalizeHloToTfLitePass() { return std::make_unique(); diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index 6043e26cb757d8..2fcdfb80b6a0ad 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -261,7 +261,7 @@ func.func @mul_one_quant(%arg0: tensor<32x!quant.uniform>) -> tenso // CHECK-LABEL: @elementwise_unary_ops -func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor) { +func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) { %0 = arith.constant dense<-1.0> : tensor %1 = arith.constant dense<1.0> : tensor %2 = arith.constant dense<1.0> : tensor @@ -269,6 +269,7 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te %4 = arith.constant dense<4.0> : tensor %5 = arith.constant dense<4.0> : tensor %6 = arith.constant dense<2.0> : tensor + %one = arith.constant dense<1.0> : tensor // CHECK-DAG: [[cst0:%.*]] = arith.constant dense<1.000000e+00> : tensor // CHECK-DAG: [[cst1:%.*]] = arith.constant dense<0.841470957> : tensor @@ -277,7 +278,8 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te // CHECK-DAG: [[cst4:%.*]] = arith.constant dense<2.000000e+00> : tensor // CHECK-DAG: [[cst5:%.*]] = arith.constant dense<5.000000e-01> : tensor // CHECK-DAG: [[cst6:%.*]] = arith.constant dense<4.000000e+00> : tensor - // CHECK: return [[cst0]], [[cst1]], [[cst2]], [[cst3]], [[cst4]], [[cst5]], [[cst6]] + // CHECK-DAG: [[cst7:%.*]] = arith.constant dense<0.761594176> : tensor + // CHECK: return [[cst0]], [[cst1]], [[cst2]], [[cst3]], [[cst4]], [[cst5]], [[cst6]], [[cst7]] %7 = "tfl.abs"(%0) : (tensor) -> tensor %8 = "tfl.sin"(%1) : (tensor) -> tensor @@ -286,8 +288,9 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te %11 = "tfl.sqrt"(%4) : (tensor) -> tensor %12 = "tfl.rsqrt"(%5) : (tensor) -> tensor %13 = "tfl.square"(%6) : (tensor) -> tensor + %14 = "tfl.tanh"(%one) : (tensor) -> tensor - func.return %7, %8, %9, %10, %11, %12, %13 : tensor, tensor, tensor, tensor, tensor, tensor, tensor + func.return %7, %8, %9, %10, %11, %12, %13, %14 : tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor } // CHECK-LABEL: @max_with_neg_f32_max_val diff --git 
a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index e950a5d91b9876..2ce933112a0a43 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -323,21 +323,19 @@ absl::Status ConvertTFExecutorToStablehloFlatbuffer( // TODO: b/264218457 - Refactor the component below once StableHLO Quantizer // can run DRQ. Temporarily using TF Quantization for StableHLO DRQ. - if (!converter_flags.has_quantization_options()) { - // The default minimum number of elements a weights array must have to be - // quantized by this transformation. - const int kWeightsMinNumElementsDefault = 1024; - - quantization::QuantizationOptions quantization_options; - - quantization_options.mutable_quantization_method()->set_preset_method( - quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8); - quantization_options.set_op_set(quantization::UNIFORM_QUANTIZED); - quantization_options.set_min_num_elements_for_weights( - kWeightsMinNumElementsDefault); - quantization::AddQuantizePtqDynamicRangePasses(pass_manager, - quantization_options); - } + // The default minimum number of elements a weights array must have to be + // quantized by this transformation. + const int kWeightsMinNumElementsDefault = 1024; + + quantization::QuantizationOptions quantization_options; + + quantization_options.mutable_quantization_method()->set_preset_method( + quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8); + quantization_options.set_op_set(quantization::UNIFORM_QUANTIZED); + quantization_options.set_min_num_elements_for_weights( + kWeightsMinNumElementsDefault); + quantization::AddQuantizePtqDynamicRangePasses(pass_manager, + quantization_options); if (failed(pass_manager.run(module))) { return status_handler.ConsumeStatus(); } @@ -350,10 +348,6 @@ absl::Status ConvertTFExecutorToStablehloFlatbuffer( pass_manager.addPass(mlir::odml::createPrintOpStatsPass( mlir::odml::GetAcceptedStableHLODialects())); mlir::odml::AddStablehloOptimizationPasses(pass_manager); - if (converter_flags.has_quantization_options()) { - stablehlo::quantization::AddQuantizationPasses( - pass_manager, converter_flags.quantization_options()); - } if (failed(pass_manager.run(module))) { return status_handler.ConsumeStatus(); } diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc b/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc index 4d6d46e55bb5be..9ccda1d0c95e69 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/op_version.cc @@ -177,6 +177,10 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { reinterpret_cast(op_sig.builtin_data); TFLITE_DCHECK(fully_connected_params != nullptr); + if (op_sig.inputs.at(1).type == kTfLiteInt2) { + return 14; + } + if (op_sig.inputs.at(0).type == kTfLiteInt16 && op_sig.inputs.at(1).type == kTfLiteInt4 && op_sig.outputs.at(0).type == kTfLiteInt16) { @@ -464,6 +468,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 1; case BuiltinOperator_SLICE: + if (op_sig.inputs.at(0).type == kTfLiteInt4) { + return 7; + } if (op_sig.inputs.at(0).type == kTfLiteUInt32) { return 6; } @@ -473,7 +480,6 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { if (op_sig.inputs.at(0).type == kTfLiteInt16) { return 4; } - // Version 3 supports string input types. 
if (op_sig.inputs.at(0).type == kTfLiteString) { return 3; } @@ -499,6 +505,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 1; case BuiltinOperator_DEQUANTIZE: + if (op_sig.inputs.at(0).type == kTfLiteInt2) { + return 7; + } if (op_sig.inputs.at(0).type == kTfLiteInt4) { return 6; } diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc b/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc index 641a2e45fb8c24..87313665d1811f 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/op_version_test.cc @@ -733,6 +733,15 @@ TEST(OpVersionTest, VersioningFullyConnectedTest) { }; fake_op_sig.ext_options.fully_connected.is_per_channel_quantized = true; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 12); + + fake_op_sig = { + .op = BuiltinOperator_FULLY_CONNECTED, + .inputs = CreateOpSignatureTensorSpecs( + std::vector{kTfLiteInt8, kTfLiteInt2}), + .outputs = CreateOpSignatureTensorSpecs(kTfLiteInt8), + .builtin_data = reinterpret_cast(&fully_connected_params), + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 14); } TEST(OpVersionTest, VersioningDequantizeTest) { @@ -757,6 +766,12 @@ TEST(OpVersionTest, VersioningDequantizeTest) { fake_op_sig.ext_options.dequantize.is_per_channel_quantized = true; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); + fake_op_sig = { + .op = BuiltinOperator_DEQUANTIZE, + .inputs = CreateOpSignatureTensorSpecs(kTfLiteInt2), + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 7); + fake_op_sig = { .op = BuiltinOperator_DEQUANTIZE, .inputs = CreateOpSignatureTensorSpecs(kTfLiteFloat32), diff --git a/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc b/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc index d7e6b7c9a2064c..aca1b463878966 100644 --- a/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc +++ b/tensorflow/compiler/mlir/lite/tools/versioning/runtime_version.cc @@ -139,6 +139,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_FULLY_CONNECTED, 11}, "2.15.0"}, {{BuiltinOperator_FULLY_CONNECTED, 12}, "2.17.0"}, {{BuiltinOperator_FULLY_CONNECTED, 13}, "2.18.0"}, + {{BuiltinOperator_FULLY_CONNECTED, 14}, "2.21.0"}, {{BuiltinOperator_GATHER, 1}, "1.6.0"}, {{BuiltinOperator_GATHER, 2}, "1.14.0"}, {{BuiltinOperator_GATHER, 3}, "1.15.0"}, @@ -294,6 +295,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_SLICE, 4}, "2.4.0"}, {{BuiltinOperator_SLICE, 5}, "2.5.0"}, {{BuiltinOperator_SLICE, 6}, "2.14.0"}, + {{BuiltinOperator_SLICE, 7}, "2.21.0"}, {{BuiltinOperator_TANH, 1}, "1.14.0"}, {{BuiltinOperator_TANH, 2}, "1.14.0"}, {{BuiltinOperator_TANH, 3}, "2.3.0"}, @@ -326,6 +328,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_DEQUANTIZE, 4}, "2.2.0"}, {{BuiltinOperator_DEQUANTIZE, 5}, "2.7.0"}, {{BuiltinOperator_DEQUANTIZE, 6}, "2.18.0"}, + {{BuiltinOperator_DEQUANTIZE, 7}, "2.21.0"}, {{BuiltinOperator_REVERSE_SEQUENCE, 1}, "1.14.0"}, {{BuiltinOperator_EQUAL, 1}, "1.14.0"}, {{BuiltinOperator_EQUAL, 2}, "1.14.0"}, diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc index 96412f20633f6a..7453ed54975a5a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc @@ 
-43,7 +43,7 @@ limitations under the License. namespace mlir { namespace TFL { namespace { -#define GEN_PASS_CLASSES +#define GEN_PASS_DEF_QUANTIZEVARIABLESPASS #include "tensorflow/compiler/mlir/lite/transforms/passes.h.inc" using ResourceIdMap = @@ -80,7 +80,7 @@ Type GetDequantizedTypeFromAssigneVariableOp(VarHandleOp var_handle_op) { } class QuantizeVariablesPass - : public QuantizeVariablesPassBase<QuantizeVariablesPass> { + : public impl::QuantizeVariablesPassBase<QuantizeVariablesPass> { public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeVariablesPass) explicit QuantizeVariablesPass() = default; diff --git a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc index 12fab673d6e43b..1b82ca5b0e61dc 100644 --- a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc @@ -74,7 +74,7 @@ llvm::SmallVector ReadAsHostEndian(ArrayRef bytes) { ret.reserve(elem_count); const char* data_ptr = reinterpret_cast<const char*>(bytes.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { T val = llvm::support::endian::readNext<T, llvm::endianness::native, llvm::support::unaligned>(data_ptr); ret.push_back(mlir::APInt(sizeof(T) * 8, val)); @@ -362,7 +362,7 @@ StatusOr ConvertFloatBuffer( assert(bytes_len % 2 == 0); // Supports both BF16 and F16. assert(elem_type.isF16() || elem_type.isBF16()); - int elem_count = bytes_len / 2; + size_t elem_count = bytes_len / 2; if (elem_type.isF16()) { std::vector values; @@ -370,7 +370,7 @@ StatusOr ConvertFloatBuffer( const char* data = reinterpret_cast<const char*>(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint16_t bit_repr = llvm::support::endian::readNext< uint16_t, llvm::endianness::native, llvm::support::unaligned>( data); @@ -385,7 +385,7 @@ StatusOr ConvertFloatBuffer( const char* data = reinterpret_cast<const char*>(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint16_t bit_repr = llvm::support::endian::readNext< uint16_t, llvm::endianness::native, llvm::support::unaligned>( data); @@ -398,13 +398,13 @@ StatusOr ConvertFloatBuffer( } case 32: { assert(bytes_len % 4 == 0); - int elem_count = bytes_len / 4; + size_t elem_count = bytes_len / 4; std::vector values; values.reserve(elem_count); const char* data = reinterpret_cast<const char*>(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint32_t bit_repr = llvm::support::endian::readNext<uint32_t, llvm::endianness::native, llvm::support::unaligned>(data); @@ -415,13 +415,13 @@ StatusOr ConvertFloatBuffer( case 64: { assert(bytes_len % 8 == 0); - int elem_count = bytes_len / 8; + size_t elem_count = bytes_len / 8; std::vector values; values.reserve(elem_count); const char* data = reinterpret_cast<const char*>(buffer.data()); - for (int i = 0; i < elem_count; i++) { + for (size_t i = 0; i < elem_count; i++) { uint64_t bit_repr = llvm::support::endian::readNext<uint64_t, llvm::endianness::native, llvm::support::unaligned>(data); diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc index 2acb4dccb88a18..0ae1247e2a156a 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc @@ -43,13 +43,13 @@ void Register(const std::string& op_name, OpRegistry* registry) { } // namespace TEST(TfTextUtilsTest, TestTfTextRegistered) { - std::unique_ptr<OpRegistry> registry(new OpRegistry); + std::unique_ptr<OpRegistry> registry = std::make_unique<OpRegistry>(); Register("WhitespaceTokenizeWithOffsets", registry.get());
EXPECT_TRUE(IsTFTextRegistered(registry.get())); } TEST(TfTextUtilsTest, TestTfTextNotRegistered) { - std::unique_ptr<OpRegistry> registry(new OpRegistry); + std::unique_ptr<OpRegistry> registry = std::make_unique<OpRegistry>(); Register("Test", registry.get()); EXPECT_FALSE(IsTFTextRegistered(registry.get())); } diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc index 9d7e689f3b6a3c..0c6a636d38b822 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc @@ -124,7 +124,7 @@ class ModifyMlirModulePass : public MlirOptimizationPass { }; FunctionDef XTimesTwo() { - const Tensor kTwo = test::AsScalar<int64>(2); + const Tensor kTwo = test::AsScalar<int64_t>(2); return FunctionDefHelper::Define( // Name "XTimesTwo", diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 5eaf5d736262ca..4f2384347a7802 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -251,7 +251,7 @@ std::string ExperimentalConvertSavedModelToMlir( // Convert the SavedModelV2Bundle to an MLIR module. - std::vector<string> exported_names = + std::vector<std::string> exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); @@ -270,10 +270,10 @@ std::string ExperimentalConvertSavedModelV1ToMlirLite( const std::string& saved_model_path, const std::string& exported_names_str, const std::string& tags, bool upgrade_legacy, bool show_debug_info, TF_Status* status) { - std::unordered_set<string> tag_set = + std::unordered_set<std::string> tag_set = absl::StrSplit(tags, ',', absl::SkipEmpty()); - std::vector<string> exported_names = + std::vector<std::string> exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); @@ -299,7 +299,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( bool show_debug_info, TF_Status* status) { // Load the saved model into a SavedModelBundle. - std::unordered_set<string> tag_set = + std::unordered_set<std::string> tag_set = absl::StrSplit(tags, ',', absl::SkipEmpty()); tensorflow::SavedModelBundle bundle; @@ -311,7 +311,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( } // Convert the SavedModelBundle to an MLIR module.
- std::vector<string> exported_names = + std::vector<std::string> exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::DialectRegistry registry; mlir::func::registerAllExtensions(registry); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc index ae93231d4d336b..5d6d36ed3a6c7d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc @@ -36,8 +36,6 @@ using ::testing::HasSubstr; using ::testing::Key; using ::testing::SizeIs; using ::testing::StrEq; -using ::tsl::testing::IsOk; -using ::tsl::testing::StatusIs; TEST(CreateRepresentativeDatasetFileMapTest, ConfigWithoutExplicitSignatureKeyMappedToServingDefault) { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc index a3a09bdb35daaa..2fb8f11a4e4349 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc @@ -49,23 +49,23 @@ class TestEnvBrokenFileSystem : public tsl::Env { public: TestEnvBrokenFileSystem() = default; - bool MatchPath(const tsl::string& path, const tsl::string& pattern) override { + bool MatchPath(const std::string& path, const std::string& pattern) override { return false; } void SleepForMicroseconds(int64_t micros) override {} - tsl::string GetRunfilesDir() override { return tsl::string("dummy_path"); } + std::string GetRunfilesDir() override { return std::string("dummy_path"); } int64_t GetCurrentThreadId() override { return 0; } tsl::Thread* StartThread(const tsl::ThreadOptions& thread_options, - const tsl::string& name, + const std::string& name, absl::AnyInvocable<void()> fn) override { return nullptr; } - bool GetCurrentThreadName(tsl::string* name) override { return false; } + bool GetCurrentThreadName(std::string* name) override { return false; } void SchedClosure(absl::AnyInvocable<void()> closure) override {} @@ -82,9 +82,9 @@ class TestEnvBrokenFileSystem : public tsl::Env { return absl::OkStatus(); } - tsl::string FormatLibraryFileName(const tsl::string& name, - const tsl::string& version) override { - return tsl::string("dummy_path"); + std::string FormatLibraryFileName(const std::string& name, + const std::string& version) override { + return std::string("dummy_path"); } // This is the part that would break the `CreateTmpDir` function because it @@ -95,7 +95,7 @@ class TestEnvBrokenFileSystem : public tsl::Env { } private: - void GetLocalTempDirectories(std::vector* list) override { + void GetLocalTempDirectories(std::vector* list) override { list->push_back("/tmp"); } }; @@ -107,7 +107,7 @@ class TestEnvBrokenFileSystemAndNoLocalTempDirs private: // This is the part that essentially breaks the `GetLocalTmpFileName` function // because it doesn't provide any available temp dirs.
- void GetLocalTempDirectories(std::vector* list) override {} + void GetLocalTempDirectories(std::vector* list) override {} }; TEST(IoTest, GetLocalTmpFileNameGivesValidFileName) { diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc index c2e91c5da16e93..1f6464d85f5ef4 100644 --- a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc @@ -4822,7 +4822,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; @@ -5022,7 +5022,7 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index e23f510182259f..4104cf412acfd8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -335,7 +335,6 @@ def TF_IfRegionOp : TF_Op<"IfRegion", "areTypesCompatible", "getEntrySuccessorOperands", "getRegionInvocationBounds", - "getSuccessorRegions" ]> ]> { let summary = "output = cond ? then_branch output : else_branch output"; @@ -395,7 +394,6 @@ def TF_GeneratorDatasetRegionOp : TF_Op<"GeneratorDatasetRegion", "areTypesCompatible", "getEntrySuccessorOperands", "getRegionInvocationBounds", - "getSuccessorRegions" ]>, SingleBlockImplicitTerminator<"YieldOp">, TF_GeneratorOpSideEffect, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index 59ba13e326a02f..6382f325a47505 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -3003,14 +3003,14 @@ void GeneratorDatasetRegionOp::getRegionInvocationBounds( } OperandRange GeneratorDatasetRegionOp::getEntrySuccessorOperands( - RegionBranchPoint point) { + RegionSuccessor successor) { auto end = this->getOperation()->operand_end(); - if (point.isParent()) { + if (successor.isParent()) { // The op itself doesn't branch back to itself. return ::mlir::OperandRange(end, end); - } else if (point.getRegionOrNull() == &getInit()) { + } else if (successor.getSuccessor() == &getInit()) { return getInitFuncOtherArgs(); - } else if (point.getRegionOrNull() == &getNext()) { + } else if (successor.getSuccessor() == &getNext()) { return getNextFuncOtherArgs(); } else /* finalize region */ { return getFinalizeFuncOtherArgs(); @@ -3024,13 +3024,15 @@ void GeneratorDatasetRegionOp::getSuccessorRegions( // The op itself branches to `init` first. regions.push_back( RegionSuccessor(&getInit(), getInit().front().getArguments())); - } else if (point.getRegionOrNull() == &getInit()) { + } else if (point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getInit()) { // `init` branches to `next`, passing along the arguments given to `init`'s // yield. Said arguments precede the "other args". 
n = getInitFuncOtherArgs().size(); regions.push_back(RegionSuccessor( &getNext(), getNext().front().getArguments().drop_back(n))); - } else if (point.getRegionOrNull() == &getNext()) { + } else if (point.getTerminatorPredecessorOrNull()->getParentRegion() == + &getNext()) { // `next` branches to itself, or to `finalize`, passing all arguments given // to `next`s yield. @@ -3045,7 +3047,8 @@ void GeneratorDatasetRegionOp::getSuccessorRegions( &getFinalize(), getFinalize().front().getArguments().slice(0, num))); } else { // `finalize` branches back to the op itself, not passing any arguments. - regions.push_back(RegionSuccessor()); + regions.push_back(RegionSuccessor( + point.getTerminatorPredecessorOrNull()->getParentRegion())); } } @@ -3261,11 +3264,12 @@ void IfRegionOp::getRegionInvocationBounds( invocationBounds.assign(2, {0, 1}); } -OperandRange IfRegionOp::getEntrySuccessorOperands(RegionBranchPoint point) { +OperandRange IfRegionOp::getEntrySuccessorOperands(RegionSuccessor successor) { // IfRegionOp currently only allows one op (the condition), so there are no // remaining operands for the successor. - assert((point.isParent() || - (point == (*this)->getRegion(0) || point == (*this)->getRegion(1))) && + assert((successor.isParent() || + (successor.getSuccessor() == &(*this)->getRegion(0) || + successor.getSuccessor() == &(*this)->getRegion(1))) && "Invalid IfRegionOp region index."); auto end = this->getOperation()->operand_end(); return ::mlir::OperandRange(end, end); @@ -3275,16 +3279,20 @@ void IfRegionOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl& regions) { if (!point.isParent()) { // The `then` and the `else` region branch back to the parent operation. - regions.push_back(RegionSuccessor(getResults())); + regions.push_back( + RegionSuccessor(point.getTerminatorPredecessorOrNull(), getResults())); return; } else { // The parent can branch to either `then` or `else`. - regions.push_back(RegionSuccessor(&getThenBranch())); + regions.push_back( + RegionSuccessor(&getThenBranch(), getThenBranch().getArguments())); Region* elseRegion = &this->getElseBranch(); if (!elseRegion->empty()) - regions.push_back(RegionSuccessor(elseRegion)); + regions.push_back( + RegionSuccessor(elseRegion, elseRegion->getArguments())); else - regions.push_back(RegionSuccessor()); + regions.push_back(RegionSuccessor( + point.getTerminatorPredecessorOrNull()->getParentRegion())); } } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 2b839d963fe2e4..23683673fe189a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -3611,8 +3611,8 @@ SmallVector WhileRegionOp::getLoopRegions() { return {&getBody()}; } //===----------------------------------------------------------------------===// OperandRange WhileRegionOp::getEntrySuccessorOperands( - RegionBranchPoint point) { - if (point.isParent()) { + RegionSuccessor successor) { + if (successor.isParent()) { // WhileRegionOp branches to the condition, which branches to the body. But // the op itself doesn't branch back to itself. So this range is empty. 
auto end = this->getOperation()->operand_end(); @@ -3628,21 +3628,28 @@ OperandRange WhileRegionOp::getEntrySuccessorOperands( void WhileRegionOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { - if (!point.isParent() && point == (*this)->getRegion(0)) { + if (!point.isParent() && + (point.getTerminatorPredecessorOrNull() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &(*this)->getRegion(0))) { // 'cond' branches to the body or returns. Operation *yield = getCond().front().getTerminator(); if (yield->getOperands().size() == 1 + this->getOperation()->getOperands().size()) { regions.push_back( RegionSuccessor(&getBody(), getBody().front().getArguments())); - regions.push_back(getResults()); + regions.push_back(RegionSuccessor(getOperation(), getResults())); } else { // For compatibility with older code, we allow the "yield" in a condition // to only yield a single boolean. In that case we can't forward any args. regions.push_back(RegionSuccessor(&getBody())); - regions.push_back(RegionSuccessor()); // branch back to parent, no args + regions.push_back( + RegionSuccessor(getOperation(), getResults().take_front(0))); } - } else if (!point.isParent() && point == (*this)->getRegion(1)) { + } else if (!point.isParent() && + (point.getTerminatorPredecessorOrNull() && + point.getTerminatorPredecessorOrNull()->getParentRegion() == + &(*this)->getRegion(1))) { // 'body' branches back to 'cond'. regions.push_back( RegionSuccessor(&getCond(), getCond().front().getArguments())); @@ -4510,7 +4517,7 @@ LogicalResult UniformQuantizedClipByValueOp::verify() { //===----------------------------------------------------------------------===// MutableOperandRange YieldOp::getMutableSuccessorOperands( - RegionBranchPoint point) { + RegionSuccessor successor) { if (auto whileOp = llvm::dyn_cast(this->getOperation()->getParentOp())) { if (&whileOp.getCond() == this->getOperation()->getParentRegion()) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 765ed1171a8449..a3305eef8a0819 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1317,7 +1317,7 @@ func.func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) - // tf.Region yield number of results should match op number of results func.func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #0 to parent results: source has 2 operands, but target successor needs 1}} + // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Operation tf.Yield to parent results: source has 2 operands, but target successor needs 1}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () @@ -1332,7 +1332,7 @@ func.func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) // ----- func.func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #1 to parent results: source has 2 operands, but target successor needs 1}} + // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Operation tf.Yield to parent results: source has 2 operands, but target successor needs 1}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : 
(tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index bc4487a4e3fd7d..954c318b416150 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" #include -#include #include #include #include @@ -29,6 +28,7 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" @@ -230,7 +230,7 @@ std::optional> EquationToMap( llvm::StringRef equation) { llvm::SmallDenseMap map; for (int64_t i = 0; i < equation.size(); ++i) { - if (!std::isalpha(equation[i])) { + if (!llvm::isAlpha(equation[i])) { // Unsupported character in the equation. return std::nullopt; } @@ -263,7 +263,7 @@ std::optional> GetAvailableLabels( const int lhs_size = lhs.size(); for (int i = 0; i < lhs_size; ++i) { const char label = lhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { labels.remove(label); ++lhs_count; } else if (label == '.') { @@ -280,7 +280,7 @@ std::optional> GetAvailableLabels( const int rhs_size = rhs.size(); for (int i = 0; i < rhs_size; ++i) { const char label = rhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { labels.remove(label); ++rhs_count; } else if (label == '.') { @@ -318,7 +318,7 @@ std::tuple FlattenEllipsis( std::string new_lhs; for (int i = 0; i < lhs.size(); ++i) { const char label = lhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { new_lhs.push_back(label); } else { // Encounter ellipsis: generate unnamed labels then insert to the new @@ -333,7 +333,7 @@ std::tuple FlattenEllipsis( std::string new_rhs, new_rhs_labels; for (int i = 0; i < rhs.size(); ++i) { const char label = rhs[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { new_rhs.push_back(label); } else { // Encounter ellipsis: generate unnamed labels then insert to the new @@ -352,7 +352,7 @@ std::tuple FlattenEllipsis( std::string new_output; for (int i = 0; i < out.size(); ++i) { const char label = out[i]; - if (std::isalpha(label)) { + if (llvm::isAlpha(label)) { new_output.push_back(label); } else { // Encounter ellipsis: we will just insert the generated labels to the new diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc index a2c4a7031ed14b..0cdb563a45eed7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import.cc @@ -49,7 +49,7 @@ static constexpr int kTextFileIndex_LineNumber = -1; class InitTextFileToImportPass : public impl::InitTextFileToImportPassBase { public: - InitTextFileToImportPass() {} + InitTextFileToImportPass() = default; InitTextFileToImportPass(const InitTextFileToImportPass&) {} explicit InitTextFileToImportPass(std::string saved_model_dir) { saved_model_dir_ = saved_model_dir; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc index 
a985cdc11611b4..41c5cd4234f1cc 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/init_text_file_to_import_test_pass.cc @@ -46,7 +46,7 @@ class InitTextFileToImportTestPass : public impl::InitTextFileToImportTestPassBase< InitTextFileToImportTestPass> { public: - explicit InitTextFileToImportTestPass() {} + explicit InitTextFileToImportTestPass() = default; StringRef getArgument() const final { return "tf-init-text-file-to-import-test"; @@ -115,7 +115,7 @@ class InitTextFileToImportSavedModelTestPass : public impl::InitTextFileToImportSavedModelTestPassBase< InitTextFileToImportSavedModelTestPass> { public: - explicit InitTextFileToImportSavedModelTestPass() {} + explicit InitTextFileToImportSavedModelTestPass() = default; private: void runOnOperation() override; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 2e023e3e057096..57a41f538f277f 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -17,7 +17,6 @@ limitations under the License. #include #include -#include #include #include #include @@ -36,6 +35,7 @@ limitations under the License. #include "absl/log/log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" @@ -289,8 +289,10 @@ ObjectNames::ObjectNames(const SavedObjectGraph& object_graph, // - `model.variables.0` // - `model.keras_api.layers.1.keras_api.trainable_variables.0` // - ... 10 more long aliases ending in digits ... - return std::make_tuple(isdigit(a.back()), a.size(), a) < - std::make_tuple(isdigit(b.back()), b.size(), b); + return std::make_tuple(absl::ascii_isdigit(a.back()), a.size(), + a) < + std::make_tuple(absl::ascii_isdigit(b.back()), b.size(), + b); }); for (const std::string& name : kv.second) { if (IsExported(name)) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc index c48f52576df4e3..0288006ee4d105 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.cc @@ -39,13 +39,13 @@ limitations under the License. 
namespace tensorflow { absl::Status ParseOutputArrayInfo(absl::string_view array_names, - std::vector* outputs) { + std::vector* outputs) { TF_RETURN_IF_ERROR(ParseNodeNames(array_names, *outputs)); return absl::OkStatus(); } -absl::Status ParseOutputArrayInfo(const std::vector& output_names, - std::vector* outputs) { +absl::Status ParseOutputArrayInfo(const std::vector& output_names, + std::vector* outputs) { for (auto& output_name : output_names) { if (output_name.empty()) continue; outputs->push_back(output_name); @@ -57,8 +57,8 @@ absl::Status ParseInputArrayInfo(absl::string_view array_names, absl::string_view data_types, absl::string_view shapes, GraphImportConfig::InputArrays* inputs) { - std::vector node_names; - std::vector node_dtypes; + std::vector node_names; + std::vector node_dtypes; std::vector>> node_shapes; TF_RETURN_IF_ERROR(ParseNodeNames(array_names, node_names)); TF_RETURN_IF_ERROR(ParseNodeDataTypes(data_types, node_dtypes)); @@ -113,8 +113,8 @@ static absl::Status HandleSubtype(absl::string_view subtype, } absl::Status ParseInputArrayInfo( - const std::vector& node_names, - const std::vector& node_dtypes, + const std::vector& node_names, + const std::vector& node_dtypes, const std::vector>>& node_shapes, GraphImportConfig::InputArrays* inputs) { std::vector used_node_dtypes; @@ -148,7 +148,7 @@ absl::Status ParseInputArrayInfo( // StringMap doesn't support reserve else reserve input map size here. for (int i = 0, end = node_names.size(); i < end; i++) { auto& name = node_names[i]; - const string& type = used_node_dtypes[i]; + const std::string& type = used_node_dtypes[i]; if (name.empty()) continue; auto it_inserted_pair = inputs->insert({name, {}}); @@ -193,7 +193,7 @@ absl::Status ParseNodeShapes( std::vector>>& shapes_vector) { shapes_vector.clear(); if (!shapes_str.empty()) { - std::vector node_shapes_str = absl::StrSplit(shapes_str, ':'); + std::vector node_shapes_str = absl::StrSplit(shapes_str, ':'); for (int i = 0; i < node_shapes_str.size(); i++) { if (node_shapes_str[i] == "*") { shapes_vector.push_back(std::nullopt); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h index 1119d4e2b33c4f..176773da45fcbc 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h @@ -35,10 +35,10 @@ namespace tensorflow { // Parses the command line flag strings to the specification of nodes in // the Graph. absl::Status ParseOutputArrayInfo(absl::string_view array_names, - std::vector* outputs); + std::vector* outputs); -absl::Status ParseOutputArrayInfo(const std::vector& output_names, - std::vector* outputs); +absl::Status ParseOutputArrayInfo(const std::vector& output_names, + std::vector* outputs); // Parses the command line flag strings to the specification of nodes in // the Graph. `data_types` input string can be empty since the flag is optional. 
@@ -48,8 +48,8 @@ absl::Status ParseInputArrayInfo(absl::string_view array_names, GraphImportConfig::InputArrays* inputs); absl::Status ParseInputArrayInfo( - const std::vector& node_names, - const std::vector& node_dtypes, + const std::vector& node_names, + const std::vector& node_dtypes, const std::vector>>& node_shapes, GraphImportConfig::InputArrays* inputs); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index 858c70a54a58d6..3706b8afe34d78 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -17,12 +17,12 @@ limitations under the License. #include #include -#include #include #include #include #include "absl/log/log.h" +#include "absl/strings/ascii.h" #include "absl/strings/str_split.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" @@ -99,8 +99,7 @@ std::vector BridgeLoggerConfig::GetFilter( bool BridgeLoggerConfig::ShouldOnlyDumpTopLevelPasses() { const char* env_var = getenv(kEnableOnlyTopLevelPassesEnvVar); - std::string value(env_var); - std::transform(value.begin(), value.end(), value.begin(), ::tolower); + std::string value = absl::AsciiStrToLower(env_var); // Return true if value is "1" or "true"; otherwise, false. return value == "1" || value == "true"; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index b0ad4e265633d8..550ab547498f45 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -249,14 +249,14 @@ absl::StatusOr ConvertTensor(const Tensor& input_tensor, CONVERT_FLAT(DT_BOOL, bool) CONVERT_FLAT(DT_FLOAT, float) CONVERT_FLAT(DT_DOUBLE, double) - CONVERT_FLAT(DT_INT8, int8) - CONVERT_FLAT(DT_INT16, int16) - CONVERT_FLAT(DT_INT32, int32) + CONVERT_FLAT(DT_INT8, int8_t) + CONVERT_FLAT(DT_INT16, int16_t) + CONVERT_FLAT(DT_INT32, int32_t) CONVERT_FLAT(DT_INT64, int64_t) - CONVERT_FLAT(DT_UINT8, uint8) - CONVERT_FLAT(DT_UINT16, uint16) - CONVERT_FLAT(DT_UINT32, uint32) - CONVERT_FLAT(DT_UINT64, uint64) + CONVERT_FLAT(DT_UINT8, uint8_t) + CONVERT_FLAT(DT_UINT16, uint16_t) + CONVERT_FLAT(DT_UINT32, uint32_t) + CONVERT_FLAT(DT_UINT64, uint64_t) CONVERT_FLAT(DT_COMPLEX64, std::complex) CONVERT_FLAT(DT_COMPLEX128, std::complex) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index a34553623408d8..b120b6c786edb6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -162,11 +162,11 @@ TEST_F(ConvertTensorTest, Simple) { ASSERT_NO_FATAL_FAILURE(VerifyConversion( {static_cast(1), static_cast(-1)}, DT_INT4, mlir::IntegerType::get(&context, 4))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT8, mlir::IntegerType::get(&context, 8))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT16, mlir::IntegerType::get(&context, 16))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT32, mlir::IntegerType::get(&context, 32))); ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT64, mlir::IntegerType::get(&context, 64))); @@ -175,19 +175,19 @@ TEST_F(ConvertTensorTest, 
Simple) { {static_cast(1), static_cast(2)}, DT_UINT4, mlir::IntegerType::get( &context, 4, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT8, mlir::IntegerType::get( &context, 8, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT16, mlir::IntegerType::get( &context, 16, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT32, mlir::IntegerType::get( &context, 32, mlir::IntegerType::SignednessSemantics::Unsigned))); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT64, mlir::IntegerType::get( &context, 64, mlir::IntegerType::SignednessSemantics::Unsigned))); @@ -222,11 +222,11 @@ TEST_F(ConvertTensorTest, SimpleDenseResourceElements) { ASSERT_NO_FATAL_FAILURE(VerifyConversion( {static_cast(1), static_cast(-1)}, DT_INT4, mlir::IntegerType::get(&context, 4), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT8, mlir::IntegerType::get(&context, 8), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT16, mlir::IntegerType::get(&context, 16), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT32, mlir::IntegerType::get(&context, 32), true)); ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, -1}, DT_INT64, mlir::IntegerType::get(&context, 64), true)); @@ -236,22 +236,22 @@ TEST_F(ConvertTensorTest, SimpleDenseResourceElements) { mlir::IntegerType::get(&context, 4, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT8, mlir::IntegerType::get(&context, 8, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT16, mlir::IntegerType::get(&context, 16, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT32, mlir::IntegerType::get(&context, 32, mlir::IntegerType::SignednessSemantics::Unsigned), true)); - ASSERT_NO_FATAL_FAILURE(VerifyConversion( + ASSERT_NO_FATAL_FAILURE(VerifyConversion( {1, 2}, DT_UINT64, mlir::IntegerType::get(&context, 64, mlir::IntegerType::SignednessSemantics::Unsigned), diff --git a/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc index 09a76102557c4f..a4f2861276a9bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config_test.cc @@ -59,9 +59,9 @@ TEST(DataDumperLoggerConfig, TestPassFilter) { 1); setenv("TF_DUMP_GRAPH_PREFIX", "sponge", 1); - const string kTestFilename = "test.txt"; + const std::string kTestFilename = "test.txt"; int print_callback_count = 0; - auto get_filename_fn = [](const string &filename, mlir::Operation *op) { + auto get_filename_fn = [](const std::string& filename, mlir::Operation* op) { return filename; }; auto print_callback = [&](llvm::raw_ostream &out) { diff --git 
a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index d9249d472b334c..3329bff4c02737 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -126,7 +126,8 @@ void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set) { // For device that do not have any metadata, or if we failed to parse metadata // from the DeviceSet, we add a unit attribute to the `tf.devices` attribute. for (Device* device : device_set->devices()) { - string name = DeviceNameUtils::ParsedNameToString(device->parsed_name()); + std::string name = + DeviceNameUtils::ParsedNameToString(device->parsed_name()); if (device->device_type() == DEVICE_GPU) { auto metadata = ParseGpuDeviceMetadata(*device, &builder); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index c3e7ae75022348..abf357873a6153 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -52,8 +52,8 @@ class FakeDevice : public Device { return errors::Unimplemented("FakeDevice::Sync()"); } - static std::unique_ptr Make(const string& name, - const string& desc = "") { + static std::unique_ptr Make(const std::string& name, + const std::string& desc = "") { DeviceNameUtils::ParsedName parsed_name; DeviceNameUtils::ParseFullName(name, &parsed_name); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc index 7e92860e5ff03e..9d9780d231523f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc @@ -26,12 +26,12 @@ limitations under the License. namespace tensorflow { namespace { -void ExpectHasSubstr(const string& s, const string& expected) { +void ExpectHasSubstr(const std::string& s, const std::string& expected) { EXPECT_TRUE(absl::StrContains(s, expected)) << "'" << s << "' does not contain '" << expected << "'"; } -void ExpectHasNoSubstr(const string& s, const string& expected) { +void ExpectHasNoSubstr(const std::string& s, const std::string& expected) { EXPECT_FALSE(absl::StrContains(s, expected)) << "'" << s << "' should not contain '" << expected << "'"; } @@ -39,7 +39,7 @@ void ExpectHasNoSubstr(const string& s, const string& expected) { // WritableFile that simply concats into string. 
class StringWritableFile : public WritableFile { public: - explicit StringWritableFile(string* str) : str_(*str) {} + explicit StringWritableFile(std::string* str) : str_(*str) {} absl::Status Append(absl::string_view data) override { absl::StrAppend(&str_, data); @@ -62,7 +62,7 @@ class StringWritableFile : public WritableFile { } private: - string& str_; + std::string& str_; }; TEST(Dump, TextualIrToFileSuccess) { @@ -72,10 +72,10 @@ TEST(Dump, TextualIrToFileSuccess) { setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1); UseMlirForGraphDump(MlirDumpConfig()); - string ret = DumpGraphToFile("tir", graph); + std::string ret = DumpGraphToFile("tir", graph); ASSERT_EQ(ret, io::JoinPath(testing::TmpDir(), "tir.mlir")); - string actual; + std::string actual; TF_ASSERT_OK(ReadFileToString(Env::Default(), ret, &actual)); } @@ -86,12 +86,12 @@ TEST(Dump, TextualIrWithOptions) { .Attr("dtype", DT_FLOAT) .Finalize(&graph, &node)); - string actual; + std::string actual; StringWritableFile file(&actual); TF_ASSERT_OK(DumpTextualIRToFile(MlirDumpConfig().emit_location_information(), graph, /*flib_def=*/nullptr, &file)); - string expected_substr = R"(loc(#loc))"; + std::string expected_substr = R"(loc(#loc))"; ExpectHasSubstr(actual, expected_substr); } @@ -100,17 +100,17 @@ TEST(Dump, DumpToTFG) { Node* node; TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node)); - string actual; + std::string actual; StringWritableFile file(&actual); TF_ASSERT_OK(DumpTextualIRToFile( MlirDumpConfig().emit_dialect(MlirDumpConfig::Dialect::kTFG), graph, /*flib_def=*/nullptr, &file)); - string expected_substr("tfg.graph"); + std::string expected_substr("tfg.graph"); ExpectHasSubstr(actual, expected_substr); - string not_expected_substr("tf_executor.island"); + std::string not_expected_substr("tf_executor.island"); ExpectHasNoSubstr(actual, not_expected_substr); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index b970ca84b326cf..138e13e3719328 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -44,7 +44,7 @@ struct NameCounts { llvm::StringMap counts; }; -std::string MakeUniqueFilename(string name) { +std::string MakeUniqueFilename(std::string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. @@ -274,7 +274,7 @@ void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path) { // Output dirs "sponge" (case-insensitive) have a special meaning: Dump into // the directory specified by the environment variable // TEST_UNDECLARED_OUTPUTS_DIR. 
- string lower_path = absl::AsciiStrToLower(path); + std::string lower_path = absl::AsciiStrToLower(path); if (lower_path == "sponge") { if (!tensorflow::io::GetTestUndeclaredOutputsDir(&path)) { LOG(ERROR) << "MLIR crash reproducer is set to '" << dir_path.str() diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 9ec1b9970ae777..9e07ece4e0999e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -400,12 +400,12 @@ absl::Status ConvertAttributes( if (auto symbol_ref = mlir::dyn_cast(attr)) { TF_RETURN_IF_ERROR(ConvertAttribute( mlir::cast(symbol_ref), &value)); - func_call_attrs[string(name)] = std::move(value); + func_call_attrs[std::string(name)] = std::move(value); continue; } if (auto func_attr = mlir::dyn_cast(attr)) { TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, remove_ref_type, &value)); - func_call_attrs[string(name)] = std::move(value); + func_call_attrs[std::string(name)] = std::move(value); continue; } if (mlir::isa(attr)) { @@ -434,12 +434,12 @@ absl::Status ConvertAttributes( // input TensorFlow GraphDef shouldn't contain '.'. If it does appear in // the attribute from MLIR, it is treated as an attribute from function // calls. - std::vector name_tokens = + std::vector name_tokens = absl::StrSplit(name, '.', absl::SkipEmpty()); TF_RET_CHECK(name_tokens.size() <= 2); auto it = func_call_attrs.find(name_tokens[0]); if (it == func_call_attrs.end()) { - (*values)[string(name)] = std::move(value); + (*values)[std::string(name)] = std::move(value); } else { (*it->second.mutable_func()->mutable_attr())[name_tokens[1]] = std::move(value); @@ -457,7 +457,7 @@ absl::Status SetShapeAttribute(absl::string_view name, AttrValue value; SetTensorShapeProto(shaped_type, value.mutable_list()->add_shape()); - auto result = values->insert({string(name), value}); + auto result = values->insert({std::string(name), value}); if (!result.second) { // This should be extremely rare as it means we are adding the same // attribute multiple times/have some redundancy in representing this diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc index 50306edb28b067..fa2ff3c8a281fa 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc @@ -59,7 +59,7 @@ absl::Status LoadProtoFromFileImpl(absl::string_view input_filename, T* proto) { if (std::error_code error = file_or_err.getError()) { return errors::InvalidArgument( "Could not open input file ", - string(input_filename.data(), input_filename.size()).c_str()); + std::string(input_filename.data(), input_filename.size()).c_str()); } const auto& input_file = *file_or_err; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc index a189cc14555143..fbcdc9e894fbd9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc @@ -41,7 +41,7 @@ const char kTensorPrefix[] = "tftensor$"; } // namespace -string MangleAttributeName(absl::string_view str) { +std::string MangleAttributeName(absl::string_view str) { return absl::StrCat(kAttributePrefix, str); } @@ -66,7 +66,7 @@ MangledKind GetMangledKind(absl::string_view str) { } } -string MangleShape(const TensorShapeProto& shape) { +std::string 
MangleShape(const TensorShapeProto& shape) { return absl::StrCat(kTensorShapePrefix, PrintShortTextProto(shape)); } @@ -74,7 +74,7 @@ absl::Status DemangleShape(absl::string_view str, TensorShapeProto* proto) { return ParseTextProto(str, kTensorShapePrefix, proto); } -string MangleTensor(const TensorProto& tensor) { +std::string MangleTensor(const TensorProto& tensor) { return absl::StrCat(kTensorPrefix, PrintShortTextProto(tensor)); } @@ -82,7 +82,7 @@ absl::Status DemangleTensor(absl::string_view str, TensorProto* proto) { return ParseTextProto(str, kTensorPrefix, proto); } -string MangleDataType(const DataType& dtype) { +std::string MangleDataType(const DataType& dtype) { return absl::StrCat(kDataTypePrefix, DataType_Name(dtype)); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h index a0c14f27b5b38f..7e95a27f0290f9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h @@ -28,7 +28,7 @@ namespace mangling_util { enum class MangledKind { kUnknown, kDataType, kTensorShape, kTensor }; // Mangles an attribute name, marking the attribute as a TensorFlow attribute. -string MangleAttributeName(absl::string_view str); +std::string MangleAttributeName(absl::string_view str); // Returns true if 'str' was mangled with MangleAttributeName. bool IsMangledAttributeName(absl::string_view str); @@ -41,17 +41,17 @@ absl::string_view DemangleAttributeName(absl::string_view str); MangledKind GetMangledKind(absl::string_view str); // Return a TensorShapeProto mangled as a string. -string MangleShape(const TensorShapeProto& shape); +std::string MangleShape(const TensorShapeProto& shape); // Demangle a string mangled with MangleShape. absl::Status DemangleShape(absl::string_view str, TensorShapeProto* proto); // Return a TensorProto mangled as a string. -string MangleTensor(const TensorProto& tensor); +std::string MangleTensor(const TensorProto& tensor); // Demangle a string mangled with MangleTensor. absl::Status DemangleTensor(absl::string_view str, TensorProto* proto); // Return a DataType mangled as a string. -string MangleDataType(const DataType& dtype); +std::string MangleDataType(const DataType& dtype); // Demangle a string mangled with MangleDataType. absl::Status DemangleDataType(absl::string_view str, DataType* proto); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc index c9a6f6e85c9d4d..c1479fead3a595 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc @@ -133,7 +133,7 @@ absl::Status SetTypeAttribute(absl::string_view name, ContainerT types, type_list.add_type(dtype); } - auto result = values->insert({string(name), value}); + auto result = values->insert({std::string(name), value}); assert(result.second && "cannot have multiple attributes with the same name"); (void)result; @@ -164,7 +164,7 @@ void SetShapeAttribute(absl::string_view name, ContainerT shapes, // If shape is already set, override it. This can happen if we import // without shape inference enabled and so couldn't be removed on import and // are not explicitly dropped later. - (*values)[string(name)] = value; + (*values)[std::string(name)] = value; } // Collects all the unregistered attributes for an TF dialect operation. 
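// A minimal round-trip sketch (an illustration for review, not part of the
// patch) of the mangling_util API whose declarations the hunks above migrate
// from the `string` alias to `std::string`. Only names visible in the diff
// are used; the include paths are assumptions about the usual TensorFlow
// headers.
//
//   #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h"
//
//   absl::Status RoundTripShape() {
//     tensorflow::TensorShapeProto shape;
//     shape.add_dim()->set_size(2);
//     shape.add_dim()->set_size(3);
//     // MangleShape prepends kTensorShapePrefix to a short text proto and,
//     // after this change, spells its return type as std::string.
//     std::string mangled = tensorflow::mangling_util::MangleShape(shape);
//     tensorflow::TensorShapeProto parsed;
//     // DemangleShape strips the prefix and parses the text proto back.
//     return tensorflow::mangling_util::DemangleShape(mangled, &parsed);
//   }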
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc index 8cb797a9a9b214..b13e099fde3557 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc @@ -214,7 +214,7 @@ absl::StatusOr> BuildConstOpGraphWithOutputShapes() { std::initializer_list dims = {2, 3, 4, 5}; Tensor tensor(data_type, TensorShape(dims)); for (int i = 0; i < 2 * 3 * 4 * 5; ++i) { - tensor.flat()(i) = i; + tensor.flat()(i) = i; } NodeDef node; diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc index 46f7f5de1d0856..74b7304b745033 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc @@ -106,9 +106,9 @@ namespace { // Time the execution of kernels (in CPU cycles). Meant to be used as RAII. struct CompilationTimer { - uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); + uint64_t start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); - uint64 ElapsedCycles() { + uint64_t ElapsedCycles() { return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles; } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 243f4333a88525..2ab0c3c619b292 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -4864,7 +4864,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; @@ -5064,7 +5064,7 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { dilations_attr.template getValues().begin(), dilations_attr.template getValues().end()}; auto strides_attr = GetI64ElementsAttr(op.getStrides()); - std::vector strides{ + std::vector strides{ strides_attr.template getValues().begin(), strides_attr.template getValues().end()}; diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD index 159bc8b17bc36b..a6ee4c3e1ffbd0 100644 --- a/tensorflow/compiler/mlir/tfr/BUILD +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -308,7 +308,7 @@ py_strict_library( "//tensorflow/python/framework:op_def_registry", "//tensorflow/python/platform:tf_logging", "//tensorflow/python/util:tf_inspect", - "@pypi_gast//:pkg", + "@pypi//gast", ], ) @@ -339,7 +339,7 @@ py_strict_library( "//tensorflow/python/autograph/pyct:transpiler", "//tensorflow/python/framework:op_def_registry", "//tensorflow/python/util:tf_inspect", - "@pypi_gast//:pkg", + "@pypi//gast", ], ) diff --git a/tensorflow/compiler/mlir/tfr/utils/utils.cc b/tensorflow/compiler/mlir/tfr/utils/utils.cc index f9e70b228c0b71..ddff766c789450 100644 --- a/tensorflow/compiler/mlir/tfr/utils/utils.cc +++ b/tensorflow/compiler/mlir/tfr/utils/utils.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "mlir/IR/Block.h" // from @llvm-project @@ -92,9 +93,9 @@ std::string GetComposeFuncName(StringRef tf_op_name) { } if (tf_op_name[i] == '.') { compose_func_name.push_back('_'); - } else if (tf_op_name[i] >= 'A' && tf_op_name[i] <= 'Z') { + } else if (llvm::isUpper(tf_op_name[i])) { compose_func_name.push_back('_'); - compose_func_name.push_back(tf_op_name[i] + 'a' - 'A'); + compose_func_name.push_back(llvm::toLower(tf_op_name[i])); } else { compose_func_name.push_back(tf_op_name[i]); } @@ -106,13 +107,13 @@ std::string GetTFOpName(StringRef compose_func_name) { std::string tf_op_name; bool after_underscore = false; for (int i = 0; i < compose_func_name.size(); ++i) { - if (compose_func_name[i] >= 'A' && compose_func_name[i] <= 'Z') { + if (llvm::isUpper(compose_func_name[i])) { // The field name must not contain uppercase letters. return {}; } if (after_underscore) { - if (compose_func_name[i] >= 'a' && compose_func_name[i] <= 'z') { - tf_op_name.push_back(compose_func_name[i] + 'A' - 'a'); + if (llvm::isLower(compose_func_name[i])) { + tf_op_name.push_back(llvm::toUpper(compose_func_name[i])); after_underscore = false; } else { // The character after a "_" must be a lowercase letter. diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index cc59c9150da769..7f4a602b1330a6 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -906,10 +906,6 @@ void CreateFallbackInitializationFunction( builder.create( func_op.getLoc(), /*resultTypes=*/mlir::TypeRange{}, /*operands=*/mlir::ValueRange{}, op->getAttrs()); - } else { - // TODO: b/381849919 - Remove this log once the bug is fixed. - LOG_FIRST_N(WARNING, 100) - << "Skip creation of fallback kernel for op index " << op_index; } } diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc index 077d662ef4ed1c..6ce41c7f4fe829 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc @@ -45,8 +45,6 @@ namespace { using ::testing::ElementsAreArray; using ::testing::FloatEq; using ::testing::IsEmpty; -using ::tsl::testing::IsOkAndHolds; -using ::tsl::testing::StatusIs; TEST(MlirToByteCodeTest, Basic) { constexpr char kBasicMlir[] = diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc index 8f06eb691551ef..6d6d572a79e9f2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc @@ -127,17 +127,17 @@ absl::StatusOr EmitToBinary(llvm::StringRef host_triple, return ostream.str().str(); } -absl::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, - llvm::StringRef host_triple, - llvm::ArrayRef architectures, - llvm::ArrayRef tile_sizes, - llvm::ArrayRef unroll_factors, bool print_ptx, +absl::Status Run(std::string input_file, std::string output_file, + std::string host_triple, + std::vector architectures, + std::vector tile_sizes, + std::vector unroll_factors, bool print_ptx, bool print_llvmir, bool enable_ftz, bool index_64bit, bool jit_compile, bool jit_i64_indexed_for_large_tensors) { // Read TF code. 
std::string hlo_code; TF_RETURN_IF_ERROR( - ReadFileToString(Env::Default(), input_file.str(), &hlo_code)); + ReadFileToString(Env::Default(), input_file, &hlo_code)); // Compile. mlir::DialectRegistry registry; @@ -160,7 +160,7 @@ absl::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, // Write .a file. TF_RETURN_IF_ERROR( - WriteStringToFile(Env::Default(), output_file.str(), binary)); + WriteStringToFile(Env::Default(), output_file, binary)); return absl::OkStatus(); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index 0c0bfee0e9407e..15a697ddf75807 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -118,7 +118,7 @@ class GpuKernelToBlobPass auto llvm_module_copy = llvm::CloneModule(*llvmModule); auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco( llvm_module_copy.get(), - tensorflow::se::RocmComputeCapability{arch_str}, options, + stream_executor::GpuComputeCapability(tensorflow::se::RocmComputeCapability{arch_str}), options, options.DebugString()); if (!hsaco_or.ok()) { return tensorflow::errors::Internal("Failure when generating HSACO"); diff --git a/tensorflow/compiler/mlir/tosa/tfl_passes.h b/tensorflow/compiler/mlir/tosa/tfl_passes.h index 96d3cabf0c1f1f..02bd007f6fa36c 100644 --- a/tensorflow/compiler/mlir/tosa/tfl_passes.h +++ b/tensorflow/compiler/mlir/tosa/tfl_passes.h @@ -42,8 +42,8 @@ struct TOSATFLLegalizationPipelineOptions llvm::cl::desc("Dequantize the TFLite softmax"), llvm::cl::init(false)}; TOSATFLLegalizationPipelineOptions() { - disabled_patterns = std::nullopt; - enabled_patterns = std::nullopt; + disabled_patterns = {}; + enabled_patterns = {}; } }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.h b/tensorflow/compiler/mlir/tosa/transforms/passes.h index de0872b660d4ec..0475d46a37a091 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.h +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.h @@ -53,8 +53,8 @@ std::unique_ptr> createFuseBiasTFPass(); // `enabledPatterns` is a set of labels used to filter out input patterns that // do not have one of the labels in this set. std::unique_ptr> createLegalizeTFLPass( - ArrayRef disabled_patterns = std::nullopt, - ArrayRef enabled_patterns = std::nullopt); + ArrayRef disabled_patterns = {}, + ArrayRef enabled_patterns = {}); std::unique_ptr> createRetainCallOnceFuncsPass(); std::unique_ptr> createStripModuleMetadataPass(); diff --git a/tensorflow/compiler/mlir/utils/name_utils.cc b/tensorflow/compiler/mlir/utils/name_utils.cc index fd50116ba7d1a7..fb5bb77644c211 100644 --- a/tensorflow/compiler/mlir/utils/name_utils.cc +++ b/tensorflow/compiler/mlir/utils/name_utils.cc @@ -31,8 +31,8 @@ namespace { // Checks if a character is legal for a TensorFlow node name, with special // handling if a character is at the beginning. 
bool IsLegalChar(char c, bool first_char) { - if (isalpha(c)) return true; - if (isdigit(c)) return true; + if (llvm::isAlpha(c)) return true; + if (llvm::isDigit(c)) return true; if (c == '.') return true; if (c == '_') return true; diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 94ca1645435a2a..e44cfddd144a12 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -2,7 +2,14 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test") load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_cuda_cc_test") load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow/compiler/tests:build_combined_defs.bzl", "tf_xla_combined_py_test") -load("//tensorflow/compiler/tests:build_defs.bzl", "generate_backend_suites", "tf_xla_py_strict_test") +load( + "//tensorflow/compiler/tests:build_defs.bzl", + "generate_backend_suites", + "tf_xla_py_strict_test", + # copybara:uncomment_begin(google-only) + # "tpu_backends", + # copybara:uncomment_end +) load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", @@ -214,9 +221,8 @@ tf_xla_combined_py_test( name = "combined_ops_test_f", size = "medium", timeout = "long", - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, + # copybara:uncomment_begin(google-only) + # disabled_backends = tpu_backends(), # copybara:uncomment_end exec_properties = { "cpp_link.mem": "16g", @@ -341,10 +347,6 @@ tf_xla_py_strict_test( name = "add_n_test", size = "small", srcs = ["add_n_test.py"], - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, - # copybara:uncomment_end tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", @@ -496,10 +498,6 @@ tf_xla_py_strict_test( name = "cond_test", size = "small", srcs = ["cond_test.py"], - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, - # copybara:uncomment_end tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", @@ -1743,12 +1741,8 @@ tf_xla_py_strict_test( name = "tensor_list_ops_test", size = "small", srcs = ["tensor_list_ops_test.py"], - # copybara:uncomment_begin - # #TODO(b/286470564): Remove once the bug is fixed. - # disable_tpu_tfrt = True, - # copybara:uncomment_end - # TensorList ops are not implemented in the on-demand compilation model yet. - disabled_backends = ["cpu_ondemand"], + # TensorList ops are only implemented on CPU. + enabled_backends = ["cpu"], tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -1905,10 +1899,6 @@ tf_xla_py_strict_test( name = "while_test", size = "small", srcs = ["while_test.py"], - # copybara:uncomment_begin - # #TODO(b/291130193): Remove once the bug is fixed. 
- # disable_tpu_tfrt = True, - # copybara:uncomment_end tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", @@ -2165,7 +2155,6 @@ tf_xla_py_strict_test( "gpu_a100", "gpu_h100", ], - env = {"XLA_FLAGS": "--xla_backend_extra_options=xla_cpu_disable_new_fusion_emitters=true"}, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -2429,9 +2418,6 @@ tf_xla_py_strict_test( name = "where_op_tpu_test", size = "small", srcs = ["where_op_test.py"], - args = [ - "--tpu_use_tfrt=true", - ], disabled_backends = [ "cpu", "cpu_ondemand", diff --git a/tensorflow/compiler/tests/cast_test.py b/tensorflow/compiler/tests/cast_test.py index bc35db4e05f7d5..453cbeb1312648 100644 --- a/tensorflow/compiler/tests/cast_test.py +++ b/tensorflow/compiler/tests/cast_test.py @@ -35,9 +35,10 @@ def test_cast(self): dtypes.uint32, dtypes.uint64, } - for src_type in types: - for dst_type in types: - self._test_cast(src_type, dst_type) + with self.session() as session: + for src_type in types: + for dst_type in types: + self._test_cast(src_type, dst_type, session) def test_cast_fp8(self): if platform.system() == "Darwin": @@ -61,12 +62,13 @@ def test_cast_fp8(self): dtypes.uint32, dtypes.uint64, } - for fp8_type in fp8_types: - for other_type in other_types | fp8_types: - self._test_cast(fp8_type, other_type) - self._test_cast(other_type, fp8_type) + with self.session() as session: + for fp8_type in fp8_types: + for other_type in other_types | fp8_types: + self._test_cast(fp8_type, other_type, session) + self._test_cast(other_type, fp8_type, session) - def _test_cast(self, src_type, dst_type): + def _test_cast(self, src_type, dst_type, session): with self.subTest(src_type=src_type, dst_type=dst_type): shapes = [[], [4], [2, 3], [2, 0, 4]] src_np_dtype = src_type.as_numpy_dtype @@ -83,6 +85,7 @@ def _test_cast(self, src_type, dst_type): lambda x, dst_type=dst_type: math_ops.cast(x, dst_type), src, expected=dst, + local_session=session, ) # Check special values. 
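# A condensed sketch (an illustration for review, not part of the patch) of
# the session-reuse refactor that the cast_test.py hunks around this point
# apply: the test opens one session and threads it through every assertion as
# `local_session`, instead of each assertion opening its own session.
# `_check` and `test_cast_sketch` are hypothetical stand-ins for the real
# `_test_cast` helper and test method above.
#
#   def test_cast_sketch(self):
#     types = {dtypes.bool, dtypes.float32, dtypes.int32}
#     with self.session() as session:  # one session for the whole sweep
#       for src_type in types:
#         for dst_type in types:
#           self._check(src_type, dst_type, session)
#
#   def _check(self, src_type, dst_type, session):
#     with self.subTest(src_type=src_type, dst_type=dst_type):
#       self.assert_op_output_matches_expected(
#           lambda x: math_ops.cast(x, dst_type),
#           np.zeros((2, 3), src_type.as_numpy_dtype),
#           expected=np.zeros((2, 3), dst_type.as_numpy_dtype),
#           local_session=session,  # reuse the shared session
#       )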
@@ -112,6 +115,7 @@ def _test_cast(self, src_type, dst_type): lambda x, dst_type=dst_type: math_ops.cast(x, dst_type), src, expected=dst, + local_session=session, ) def test_give_me_a_name(self): diff --git a/tensorflow/compiler/tests/float_ops_test.py b/tensorflow/compiler/tests/float_ops_test.py index d8743016c20756..67a1ecc967f24c 100644 --- a/tensorflow/compiler/tests/float_ops_test.py +++ b/tensorflow/compiler/tests/float_ops_test.py @@ -23,449 +23,522 @@ class FloatOpsTest(xla_test.XLATestCase): def test_float_ops(self): - for dtype in self.float_types: - x = np.arange(-0.90, 0.90, 0.25) - self.assert_op_output_matches_expected( - math_ops.acos, x.astype(dtype), expected=np.arccos(x).astype(dtype) - ) - self.assert_op_output_matches_expected( - math_ops.asin, x.astype(dtype), expected=np.arcsin(x).astype(dtype) - ) - x = np.arange(-3, 3).reshape(1, 3, 2) - self.assert_op_output_matches_expected( - math_ops.atan, x.astype(dtype), expected=np.arctan(x).astype(dtype) - ) - - self.assert_op_output_matches_expected( - math_ops.acosh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [0, 1.3169579, 1.76274717, 2.06343707], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.asinh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [0.88137359, 1.44363548, 1.81844646, 2.09471255], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.atanh, - np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype), - expected=np.array( - [0.10033535, 0.20273255, 0.3095196, 0.42364893], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.ceil, - np.array([[-1.7, 1.2]], dtype=dtype), - expected=np.array([[-1, 2]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.cosh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [1.54308063, 3.76219569, 10.067662, 27.30823284], dtype=dtype - ), - ) - - # Disable float16 testing for now - if dtype != np.float16: - x = np.arange(-10, 10, 1).astype(dtype) - with self.session() as session: + with self.session() as session: + for dtype in self.float_types: + x = np.arange(-0.90, 0.90, 0.25) + self.assert_op_output_matches_expected( + math_ops.acos, + x.astype(dtype), + expected=np.arccos(x).astype(dtype), + local_session=session, + ) + self.assert_op_output_matches_expected( + math_ops.asin, + x.astype(dtype), + expected=np.arcsin(x).astype(dtype), + local_session=session, + ) + x = np.arange(-3, 3).reshape(1, 3, 2) + self.assert_op_output_matches_expected( + math_ops.atan, + x.astype(dtype), + expected=np.arctan(x).astype(dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.acosh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [0, 1.3169579, 1.76274717, 2.06343707], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.asinh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [0.88137359, 1.44363548, 1.81844646, 2.09471255], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.atanh, + np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype), + expected=np.array( + [0.10033535, 0.20273255, 0.3095196, 0.42364893], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.ceil, + np.array([[-1.7, 1.2]], dtype=dtype), + expected=np.array([[-1, 2]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.cosh, + np.array([1, 2, 3, 4], 
dtype=dtype), + expected=np.array( + [1.54308063, 3.76219569, 10.067662, 27.30823284], dtype=dtype + ), + local_session=session, + ) + + # Disable float16 testing for now + if dtype != np.float16: + x = np.arange(-10, 10, 1).astype(dtype) erf_x = session.run(math_ops.erf(x)) erfc_x = session.run(math_ops.erfc(x)) - self.assert_op_output_matches_expected(math_ops.erf, x, expected=erf_x) - self.assert_op_output_matches_expected( - math_ops.erfc, x, expected=erfc_x - ) - - self.assert_op_output_matches_expected( - math_ops.exp, - np.array([[-1, 1]], dtype=dtype), - expected=np.array([[0.36787945, 2.7182817]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.expm1, - np.array([[-1, 1]], dtype=dtype), - expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype), - rtol=1e-5, - ) - - self.assert_op_output_matches_expected( - math_ops.floor, - np.array([[-1.7, 1.2]], dtype=dtype), - expected=np.array([[-2, 1]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.is_finite, - np.array( - [[-np.inf, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype - ), - expected=np.array([[0, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=np.bool_), - ) - - # Tests for tf.nn ops. - self.assert_op_output_matches_expected( - nn_ops.l2_loss, np.array([[[]]], dtype=dtype), expected=dtype(0) - ) - - self.assert_op_output_matches_expected(nn_ops.l2_loss, dtype(4), dtype(8)) - - self.assert_op_output_matches_expected( - nn_ops.l2_loss, np.array([[-2, 4]], dtype=dtype), expected=dtype(10) - ) - - self.assert_op_output_matches_expected( - math_ops.reciprocal, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[1, 0.5]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.log, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[0, 0.69314718]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.sin, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[0.841478, 0.909302]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.cos, - np.array([[1, 2]], dtype=dtype), - expected=np.array([[0.540297, -0.41614]], dtype=dtype), - ) - - # Confirm that log1p will remain precise across a range of small values. 
- self.assert_op_output_matches_expected( - math_ops.log1p, - np.array( - [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], - dtype=dtype, - ), - expected=np.log1p( - np.array( - [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], - dtype=dtype, - ) - ).astype(dtype), - rtol=1e-15 if dtype == np.float64 else 1e-4, - atol=1e-15 if dtype == np.float64 else 1e-4, - ) - - self.assert_op_output_matches_expected( - math_ops.rint, - np.array( - [ - [-1.7, 1.2, 4.0, 0.0], - [-3.5, -2.5, -1.5, -0.5], - [0.5, 1.5, 2.5, 3.5], - ], - dtype=dtype, - ), - expected=np.array( - [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype - ), - ) - self.assert_op_output_matches_expected( - math_ops.round, - np.array( - [ - [-1.7, 1.2, 4.0, 0.0], - [-3.5, -2.5, -1.5, -0.5], - [0.5, 1.5, 2.5, 3.5], - ], - dtype=dtype, - ), - expected=np.array( - [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.rsqrt, - np.array([[4, 16]], dtype=dtype), - expected=np.array([[0.5, 0.25]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.sigmoid, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [ - [0.7310586, 0.7310586, 0.7310586, 0.7310586], - [0.7310586, 0.880797, 0.95257413, 0.98201376], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - math_ops.sigmoid, - np.array([-300, -150, 0, 150, 300], dtype=dtype), - expected=np.array([0, 0, 0.5, 1, 1], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.sinh, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [1.17520119, 3.62686041, 10.01787493, 27.2899172], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.sqrt, - np.array([[4, 9]], dtype=dtype), - expected=np.array([[2, 3]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.tan, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.tanh, - np.array( - [[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], [19, -19, 22, -22]], - dtype=dtype, - ), - expected=np.array( - [ - [0.76159418, 0.96402758, 0.99505478, 0.99932933], - [1.0, -1.0, np.nan, 1.0], - [1.0, -1.0, 1.0, -1.0], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.log_softmax, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [ - [-1.3862944, -1.3862944, -1.3862944, -1.3862944], - [-3.4401896, -2.4401896, -1.4401897, -0.44018969], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.elu, - np.array([[-1, 0, 1, -1e-6]], dtype=dtype), - expected=np.array([[-0.63212056, 0, 1, -9.999995e-07]], dtype=dtype), - rtol=1e-5, - atol=1e-6, - ) - - self.assert_op_output_matches_expected( - nn_ops.selu, - np.array([[-1, 0, 1, -1e-5]], dtype=dtype), - expected=np.array( - [[-1.11133074, 0.0, 1.05070099, -1.758090550379974e-05]], - dtype=dtype, - ), - rtol=1e-5, - atol=1e-6, - ) - - self.assert_op_output_matches_expected( - nn_ops.relu, - np.array([[-1, 1]], dtype=dtype), - expected=np.array([[0, 1]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - nn_ops.relu6, - np.array([[-0.05, 6.05, 5]], dtype=dtype), - expected=np.array([[0, 6, 5]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - nn_ops.leaky_relu, - np.array([[-2, -1, 0, 1, 2]], dtype=dtype), - expected=np.array([[-0.4, -0.2, 0.0, 1.0, 
2.0]], dtype=dtype), - ) - - self.assert_op_output_matches_expected( - nn_ops.softmax, - np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array( - [0.032058604, 0.087144323, 0.23688284, 0.64391428], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.softmax, - np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), - expected=np.array( - [ - [0.25, 0.25, 0.25, 0.25], - [0.032058604, 0.087144323, 0.23688284, 0.64391428], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.softmax, - np.array([[[1, 1], [1, 1]], [[1, 2], [3, 4]]], dtype=dtype), - expected=np.array( - [ - [[0.5, 0.5], [0.5, 0.5]], - [[0.26894142, 0.73105858], [0.26894142, 0.73105858]], - ], - dtype=dtype, - ), - ) - - self.assert_op_output_matches_expected( - nn_ops.softsign, - np.array([[-2, -1, 0, 1, 2]], dtype=dtype), - expected=np.array( - [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.sign, - np.array( - [[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0, float("nan")]], dtype=dtype - ), - expected=np.array( - [[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0, float("nan")]], dtype=dtype - ), - ) - - self.assert_op_output_matches_expected( - math_ops.is_finite, - np.array( - [[42, float("inf"), -123], [float("nan"), 0, -0.0]], dtype=dtype - ), - expected=np.array( - [[True, False, True], [False, True, True]], dtype=np.bool_ - ), - ) - - self.assert_op_output_matches_expected( - math_ops.lgamma, - np.array(0.5, dtype=dtype), - expected=np.array(np.log(np.pi) / 2, dtype=dtype), - ) - - self.assert_op_output_matches_expected( - math_ops.lgamma, - np.array( - [ - [1, 2, 3], - [4, 5, 6], - [1 / 2, 3 / 2, 5 / 2], - [-3 / 2, -7 / 2, -11 / 2], - ], - dtype=dtype, - ), - expected=np.array( - [ - [0, 0, np.log(2.0)], - [np.log(6.0), np.log(24.0), np.log(120)], - [ - np.log(np.pi) / 2, - np.log(np.pi) / 2 - np.log(2), - np.log(np.pi) / 2 - np.log(4) + np.log(3), - ], - [ - np.log(np.pi) / 2 - np.log(3) + np.log(4), - np.log(np.pi) / 2 - np.log(105) + np.log(16), - np.log(np.pi) / 2 - np.log(10395) + np.log(64), - ], - ], - dtype=dtype, - ), - ) - - # The actual result is complex. Take the real part. 
- self.assert_op_output_matches_expected( - math_ops.lgamma, - np.array([-1 / 2, -5 / 2, -9 / 2], dtype=dtype), - expected=np.array( - [ - np.log(np.pi) / 2 + np.log(2), - np.log(np.pi) / 2 - np.log(15) + np.log(8), - np.log(np.pi) / 2 - np.log(945) + np.log(32), - ], - dtype=dtype, - ), - atol=1e-4, - ) - - self.assert_op_output_matches_expected( - math_ops.digamma, - np.array( - [ - [1.0, 0.5, 1 / 3.0], - [0.25, 1 / 6.0, 0.125], - [2.0, 3.0, 4.0], - [6.0, 8.0, 9.0], - ], - dtype=dtype, - ), - expected=np.array( - [ - [ - -np.euler_gamma, - -2 * np.log(2) - np.euler_gamma, - -np.pi / 2 / np.sqrt(3) - - 3 * np.log(3) / 2 - - np.euler_gamma, - ], - [ - -np.pi / 2 - 3 * np.log(2) - np.euler_gamma, - -np.pi * np.sqrt(3) / 2 - - 2 * np.log(2) - - 3 * np.log(3) / 2 - - np.euler_gamma, - -np.pi / 2 - - 4 * np.log(2) - - ( - np.pi - + np.log(2 + np.sqrt(2)) - - np.log(2 - np.sqrt(2)) - ) - / np.sqrt(2) - - np.euler_gamma, - ], - [ - 1 - np.euler_gamma, - 1.5 - np.euler_gamma, - 11 / 6.0 - np.euler_gamma, - ], - [ - 137 / 60.0 - np.euler_gamma, - 363 / 140.0 - np.euler_gamma, - 761 / 280.0 - np.euler_gamma, - ], - ], - dtype=dtype, - ), - ) + self.assert_op_output_matches_expected( + math_ops.erf, + x, + expected=erf_x, + local_session=session, + ) + self.assert_op_output_matches_expected( + math_ops.erfc, + x, + expected=erfc_x, + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.exp, + np.array([[-1, 1]], dtype=dtype), + expected=np.array([[0.36787945, 2.7182817]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.expm1, + np.array([[-1, 1]], dtype=dtype), + expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype), + local_session=session, + rtol=1e-5, + ) + + self.assert_op_output_matches_expected( + math_ops.floor, + np.array([[-1.7, 1.2]], dtype=dtype), + expected=np.array([[-2, 1]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.is_finite, + np.array( + [[-np.inf, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype + ), + expected=np.array([[0, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=np.bool_), + local_session=session, + ) + + # Tests for tf.nn ops. + self.assert_op_output_matches_expected( + nn_ops.l2_loss, + np.array([[[]]], dtype=dtype), + expected=dtype(0), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.l2_loss, + dtype(4), + dtype(8), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.l2_loss, + np.array([[-2, 4]], dtype=dtype), + expected=dtype(10), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.reciprocal, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[1, 0.5]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.log, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[0, 0.69314718]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sin, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[0.841478, 0.909302]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.cos, + np.array([[1, 2]], dtype=dtype), + expected=np.array([[0.540297, -0.41614]], dtype=dtype), + local_session=session, + ) + + # Confirm that log1p will remain precise across a range of small values. 
+ self.assert_op_output_matches_expected( + math_ops.log1p, + np.array( + [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], + dtype=dtype, + ), + expected=np.log1p( + np.array( + [[1e-14, 1e-15, 0.6, 2] + [x * 1e-5 for x in range(1, 20)]], + dtype=dtype, + ) + ).astype(dtype), + local_session=session, + rtol=1e-15 if dtype == np.float64 else 1e-4, + atol=1e-15 if dtype == np.float64 else 1e-4, + ) + + self.assert_op_output_matches_expected( + math_ops.rint, + np.array( + [ + [-1.7, 1.2, 4.0, 0.0], + [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5], + ], + dtype=dtype, + ), + expected=np.array( + [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype + ), + local_session=session, + ) + self.assert_op_output_matches_expected( + math_ops.round, + np.array( + [ + [-1.7, 1.2, 4.0, 0.0], + [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5], + ], + dtype=dtype, + ), + expected=np.array( + [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.rsqrt, + np.array([[4, 16]], dtype=dtype), + expected=np.array([[0.5, 0.25]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sigmoid, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), + expected=np.array( + [ + [0.7310586, 0.7310586, 0.7310586, 0.7310586], + [0.7310586, 0.880797, 0.95257413, 0.98201376], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sigmoid, + np.array([-300, -150, 0, 150, 300], dtype=dtype), + expected=np.array([0, 0, 0.5, 1, 1], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sinh, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [1.17520119, 3.62686041, 10.01787493, 27.2899172], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sqrt, + np.array([[4, 9]], dtype=dtype), + expected=np.array([[2, 3]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.tan, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.tanh, + np.array( + [ + [1, 2, 3, 4], + [np.inf, -np.inf, np.nan, 20], + [19, -19, 22, -22], + ], + dtype=dtype, + ), + expected=np.array( + [ + [0.76159418, 0.96402758, 0.99505478, 0.99932933], + [1.0, -1.0, np.nan, 1.0], + [1.0, -1.0, 1.0, -1.0], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.log_softmax, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), + expected=np.array( + [ + [-1.3862944, -1.3862944, -1.3862944, -1.3862944], + [-3.4401896, -2.4401896, -1.4401897, -0.44018969], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.elu, + np.array([[-1, 0, 1, -1e-6]], dtype=dtype), + expected=np.array( + [[-0.63212056, 0, 1, -9.999995e-07]], dtype=dtype + ), + rtol=1e-5, + atol=1e-6, + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.selu, + np.array([[-1, 0, 1, -1e-5]], dtype=dtype), + expected=np.array( + [[-1.11133074, 0.0, 1.05070099, -1.758090550379974e-05]], + dtype=dtype, + ), + rtol=1e-5, + atol=1e-6, + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.relu, + np.array([[-1, 1]], dtype=dtype), + 
expected=np.array([[0, 1]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.relu6, + np.array([[-0.05, 6.05, 5]], dtype=dtype), + expected=np.array([[0, 6, 5]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.leaky_relu, + np.array([[-2, -1, 0, 1, 2]], dtype=dtype), + expected=np.array([[-0.4, -0.2, 0.0, 1.0, 2.0]], dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softmax, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array( + [0.032058604, 0.087144323, 0.23688284, 0.64391428], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softmax, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), + expected=np.array( + [ + [0.25, 0.25, 0.25, 0.25], + [0.032058604, 0.087144323, 0.23688284, 0.64391428], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softmax, + np.array([[[1, 1], [1, 1]], [[1, 2], [3, 4]]], dtype=dtype), + expected=np.array( + [ + [[0.5, 0.5], [0.5, 0.5]], + [[0.26894142, 0.73105858], [0.26894142, 0.73105858]], + ], + dtype=dtype, + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + nn_ops.softsign, + np.array([[-2, -1, 0, 1, 2]], dtype=dtype), + expected=np.array( + [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.sign, + np.array( + [[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0, float("nan")]], dtype=dtype + ), + expected=np.array( + [[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0, float("nan")]], dtype=dtype + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.is_finite, + np.array( + [[42, float("inf"), -123], [float("nan"), 0, -0.0]], dtype=dtype + ), + expected=np.array( + [[True, False, True], [False, True, True]], dtype=np.bool_ + ), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.lgamma, + np.array(0.5, dtype=dtype), + expected=np.array(np.log(np.pi) / 2, dtype=dtype), + local_session=session, + ) + + self.assert_op_output_matches_expected( + math_ops.lgamma, + np.array( + [ + [1, 2, 3], + [4, 5, 6], + [1 / 2, 3 / 2, 5 / 2], + [-3 / 2, -7 / 2, -11 / 2], + ], + dtype=dtype, + ), + expected=np.array( + [ + [0, 0, np.log(2.0)], + [np.log(6.0), np.log(24.0), np.log(120)], + [ + np.log(np.pi) / 2, + np.log(np.pi) / 2 - np.log(2), + np.log(np.pi) / 2 - np.log(4) + np.log(3), + ], + [ + np.log(np.pi) / 2 - np.log(3) + np.log(4), + np.log(np.pi) / 2 - np.log(105) + np.log(16), + np.log(np.pi) / 2 - np.log(10395) + np.log(64), + ], + ], + dtype=dtype, + ), + local_session=session, + ) + + # The actual result is complex. Take the real part. 
+ self.assert_op_output_matches_expected( + math_ops.lgamma, + np.array([-1 / 2, -5 / 2, -9 / 2], dtype=dtype), + expected=np.array( + [ + np.log(np.pi) / 2 + np.log(2), + np.log(np.pi) / 2 - np.log(15) + np.log(8), + np.log(np.pi) / 2 - np.log(945) + np.log(32), + ], + dtype=dtype, + ), + local_session=session, + atol=1e-4, + ) + + self.assert_op_output_matches_expected( + math_ops.digamma, + np.array( + [ + [1.0, 0.5, 1 / 3.0], + [0.25, 1 / 6.0, 0.125], + [2.0, 3.0, 4.0], + [6.0, 8.0, 9.0], + ], + dtype=dtype, + ), + expected=np.array( + [ + [ + -np.euler_gamma, + -2 * np.log(2) - np.euler_gamma, + -np.pi / 2 / np.sqrt(3) + - 3 * np.log(3) / 2 + - np.euler_gamma, + ], + [ + -np.pi / 2 - 3 * np.log(2) - np.euler_gamma, + -np.pi * np.sqrt(3) / 2 + - 2 * np.log(2) + - 3 * np.log(3) / 2 + - np.euler_gamma, + -np.pi / 2 + - 4 * np.log(2) + - ( + np.pi + + np.log(2 + np.sqrt(2)) + - np.log(2 - np.sqrt(2)) + ) + / np.sqrt(2) + - np.euler_gamma, + ], + [ + 1 - np.euler_gamma, + 1.5 - np.euler_gamma, + 11 / 6.0 - np.euler_gamma, + ], + [ + 137 / 60.0 - np.euler_gamma, + 363 / 140.0 - np.euler_gamma, + 761 / 280.0 - np.euler_gamma, + ], + ], + dtype=dtype, + ), + local_session=session, + ) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index b96608ed392263..fcd3aadbe10c9a 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -110,12 +110,12 @@ namespace { int64_t tf_xla_random_seed = 0; int32_t tf_xla_test_repetitions = 20; int64_t tf_xla_max_tensor_size = 10000LL; -string* tf_xla_test_device_ptr; // initial value set in main() -string* tf_xla_reference_device_ptr; // initial value set in main() +std::string* tf_xla_test_device_ptr; // initial value set in main() +std::string* tf_xla_reference_device_ptr; // initial value set in main() bool tf_xla_test_use_jit = true; bool tf_xla_test_use_mlir = false; -string LocalDeviceToFullDeviceName(const string& device) { +std::string LocalDeviceToFullDeviceName(const std::string& device) { return absl::StrCat("/job:localhost/replica:0/task:0/device:", device); } @@ -129,7 +129,7 @@ constexpr std::array kAllNumberTypes = { // operator. class OpTestBuilder { public: - explicit OpTestBuilder(const string& op_name); + explicit OpTestBuilder(const std::string& op_name); // Adds an input 'tensor' as a Placeholder node. OpTestBuilder& Input(const Tensor& tensor); @@ -161,10 +161,11 @@ class OpTestBuilder { // sets it to the NodeDef of the operator under test. Fills 'inputs' and // 'outputs' with the names of the input placeholder nodes and the output // identity nodes, respectively. 
- absl::Status BuildGraph(const string& name_prefix, const string& device, - bool use_jit, GraphDef* graphdef, - NodeDef** test_node_def, std::vector* inputs, - std::vector* outputs) const; + absl::Status BuildGraph(const std::string& name_prefix, + const std::string& device, bool use_jit, + GraphDef* graphdef, NodeDef** test_node_def, + std::vector* inputs, + std::vector* outputs) const; struct InputDescription { Tensor tensor; @@ -182,7 +183,7 @@ class OpTestBuilder { std::vector inputs_; }; -OpTestBuilder::OpTestBuilder(const string& op_name) { +OpTestBuilder::OpTestBuilder(const std::string& op_name) { node_def_.set_op(op_name); } @@ -247,12 +248,10 @@ OpTestBuilder& OpTestBuilder::Attr(absl::string_view attr_name, return *this; } -absl::Status OpTestBuilder::BuildGraph(const string& name_prefix, - const string& device, bool use_jit, - GraphDef* graphdef, - NodeDef** test_node_def, - std::vector* inputs, - std::vector* outputs) const { +absl::Status OpTestBuilder::BuildGraph( + const std::string& name_prefix, const std::string& device, bool use_jit, + GraphDef* graphdef, NodeDef** test_node_def, + std::vector* inputs, std::vector* outputs) const { OpRegistryInterface* op_registry = OpRegistry::Global(); const OpDef* op_def; @@ -275,7 +274,7 @@ absl::Status OpTestBuilder::BuildGraph(const string& name_prefix, // Build feed and fetch nodes. for (int i = 0; i < input_types.size(); ++i) { NodeDef* def = graphdef->add_node(); - string name = absl::StrCat(name_prefix, "_input_", i); + std::string name = absl::StrCat(name_prefix, "_input_", i); TF_RETURN_IF_ERROR(NodeDefBuilder(name, "Placeholder") .Device(device) .Attr("dtype", input_types[i]) @@ -286,7 +285,7 @@ absl::Status OpTestBuilder::BuildGraph(const string& name_prefix, for (int i = 0; i < output_types.size(); ++i) { NodeDef* def = graphdef->add_node(); - string name = absl::StrCat(name_prefix, "_output_", i); + std::string name = absl::StrCat(name_prefix, "_output_", i); TF_RETURN_IF_ERROR(NodeDefBuilder(name, "Identity") .Device(device) .Attr("T", output_types[i]) @@ -494,7 +493,7 @@ class OpTest : public ::testing::Test { const std::vector& spatial_dims); // Converts an int64 vector to an int32 vector. 
- std::vector AsInt32s(const std::vector& int64s); + std::vector AsInt32s(const std::vector& int64s); std::mt19937& generator() { return *generator_; } @@ -664,16 +663,16 @@ class TensorGeneratorComplex64 : public TensorGenerator { } }; -class TensorGeneratorInt32 : public TensorGenerator { +class TensorGeneratorInt32 : public TensorGenerator { public: explicit TensorGeneratorInt32(OpTest& test) : TensorGenerator(test) {} DataType dtype() override { return DT_INT32; } - void RandomVals(std::optional lo, std::optional hi, + void RandomVals(std::optional lo, std::optional hi, bool needs_unique_values, - absl::FixedArray& vals) override { - absl::flat_hash_set already_generated; - std::uniform_int_distribution distribution(lo.value_or(-(1 << 20)), - hi.value_or(1 << 20)); + absl::FixedArray& vals) override { + absl::flat_hash_set already_generated; + std::uniform_int_distribution distribution(lo.value_or(-(1 << 20)), + hi.value_or(1 << 20)); for (int64_t i = 0; i < vals.size(); ++i) { int32_t generated; do { @@ -685,13 +684,13 @@ class TensorGeneratorInt32 : public TensorGenerator { } }; -class TensorGeneratorInt64 : public TensorGenerator { +class TensorGeneratorInt64 : public TensorGenerator { public: explicit TensorGeneratorInt64(OpTest& test) : TensorGenerator(test) {} DataType dtype() override { return DT_INT64; } - void RandomVals(std::optional lo, std::optional hi, + void RandomVals(std::optional lo, std::optional hi, bool needs_unique_values, - absl::FixedArray& vals) override { + absl::FixedArray& vals) override { absl::flat_hash_set already_generated; std::uniform_int_distribution distribution( lo.value_or(-(1LL << 40)), hi.value_or(1LL << 40)); @@ -928,18 +927,19 @@ Tensor OpTest::RandomBoundedTensor(DataType dtype, Tensor lo, Tensor hi) { break; } case DT_INT32: { - auto lo_flat = lo.flat(); - auto hi_flat = hi.flat(); - test::FillFn(&tensor, [this, &lo_flat, &hi_flat](int i) -> int32 { - std::uniform_int_distribution distribution(lo_flat(i), - hi_flat(i)); - return distribution(generator()); - }); + auto lo_flat = lo.flat(); + auto hi_flat = hi.flat(); + test::FillFn( + &tensor, [this, &lo_flat, &hi_flat](int i) -> int32_t { + std::uniform_int_distribution distribution(lo_flat(i), + hi_flat(i)); + return distribution(generator()); + }); break; } case DT_INT64: { - auto lo_flat = lo.flat(); - auto hi_flat = hi.flat(); + auto lo_flat = lo.flat(); + auto hi_flat = hi.flat(); test::FillFn( &tensor, [this, &lo_flat, &hi_flat](int i) -> int64_t { std::uniform_int_distribution distribution(lo_flat(i), @@ -1021,21 +1021,21 @@ OpTest::BroadcastableDims() { Tensor OpTest::RandomReductionIndices(int rank) { std::bernoulli_distribution random_bool; - std::vector indices; + std::vector indices; for (int i = 0; i < rank; ++i) { if (random_bool(generator())) { indices.push_back(i); } } - return test::AsTensor(indices); + return test::AsTensor(indices); } // Helper that converts 'values' to an int32 or int64 Tensor. static Tensor AsIntTensor(DataType dtype, const std::vector& values) { switch (dtype) { case DT_INT32: { - std::vector values32(values.begin(), values.end()); - return test::AsTensor(values32); + std::vector values32(values.begin(), values.end()); + return test::AsTensor(values32); } case DT_INT64: return test::AsTensor(values); @@ -1092,9 +1092,9 @@ OpTest::ConcatArguments OpTest::ChooseConcatArguments(bool int64_idx_allowed) { std::vector dims = RandomDims(1, 4, 0, 64); int axis = - std::uniform_int_distribution(0, dims.size() - 1)(generator()); - a.axis = - use_int64_idx ? 
test::AsScalar(axis) : test::AsScalar(axis); + std::uniform_int_distribution(0, dims.size() - 1)(generator()); + a.axis = use_int64_idx ? test::AsScalar(axis) + : test::AsScalar(axis); for (int i = 0; i < a.n; ++i) { std::vector shape = dims; @@ -1113,7 +1113,7 @@ OpTest::EinsumArguments OpTest::ChooseEinsumArguments() { switch (op_kind) { case matmul: case batchmatmul: { - std::vector dims; + std::vector dims; if (op_kind == matmul) { a.equation = "ij,jk->ik"; dims = RandomDims(2, 2); @@ -1131,7 +1131,7 @@ OpTest::EinsumArguments OpTest::ChooseEinsumArguments() { } case dot: { a.equation = "i,i->"; - std::vector dims = RandomDims(1, 1); + std::vector dims = RandomDims(1, 1); a.lhs_dims = dims; a.rhs_dims = dims; break; @@ -1166,11 +1166,11 @@ OpTest::GatherArguments OpTest::ChooseGatherArguments(bool axis_0) { a.batch_dims, kDefaultMaxRank - 1); axis = axis_distribution(generator()); } - a.axis = test::AsScalar((int32)axis); + a.axis = test::AsScalar((int32_t)axis); a.params_shape = RandomDims(axis + 1, kDefaultMaxRank, 1, 16); std::vector indices_shape = RandomDims(0, 3, 0, 16); - a.indices = RandomBoundedTensor(DT_INT32, 0, a.params_shape[axis] - 1, - false, indices_shape); + a.indices = RandomBoundedTensor( + DT_INT32, 0, a.params_shape[axis] - 1, false, indices_shape); return a; } @@ -1209,7 +1209,7 @@ OpTest::ScatterArguments OpTest::ChooseScatterArguments() { a.indices_type = DT_INT32; a.shape = RandomDims(1, kDefaultMaxRank, 1); int rank = a.shape.size(); - std::uniform_int_distribution index_len_dist(1, rank); + std::uniform_int_distribution index_len_dist(1, rank); int index_len = index_len_dist(generator()); std::vector indices_first = RandomDims(1, kDefaultMaxRank - 1, 1); std::vector indices_shape(indices_first); @@ -1219,9 +1219,9 @@ OpTest::ScatterArguments OpTest::ChooseScatterArguments() { updates_shape.push_back(a.shape[index_len + i]); } Tensor indices_lo(a.indices_type, TensorShape(indices_shape)); - test::FillFn(&indices_lo, [](int i) -> int32 { return 0; }); + test::FillFn(&indices_lo, [](int i) -> int32_t { return 0; }); Tensor indices_hi(a.indices_type, TensorShape(indices_shape)); - test::FillFn(&indices_hi, [index_len, &a](int i) -> int32 { + test::FillFn(&indices_hi, [index_len, &a](int i) -> int32_t { int idx_dim = i % index_len; return a.shape[idx_dim] - 1; }); @@ -1239,16 +1239,16 @@ OpTest::SliceArguments OpTest::ChooseSliceArguments(bool neg_one_size) { a.shape = RandomDims(); int rank = a.shape.size(); - std::vector indices(rank); + std::vector indices(rank); a.size.resize(rank); for (int i = 0; i < rank; ++i) { indices[i] = - std::uniform_int_distribution(0, a.shape[i])(generator()); + std::uniform_int_distribution(0, a.shape[i])(generator()); int64_t low = neg_one_size ? -1 : 0; a.size[i] = std::uniform_int_distribution( low, a.shape[i] - indices[i])(generator()); } - a.indices = test::AsTensor(indices); + a.indices = test::AsTensor(indices); return a; } @@ -1341,8 +1341,8 @@ std::vector OpTest::ImageDims( return dims; } -std::vector OpTest::AsInt32s(const std::vector& int64s) { - return std::vector(int64s.begin(), int64s.end()); +std::vector OpTest::AsInt32s(const std::vector& int64s) { + return std::vector(int64s.begin(), int64s.end()); } // Functions for comparing tensors. 
@@ -1382,11 +1382,11 @@ bool IsClose(const complex64& x, const complex64& y, double atol, } template -string Str(T x) { +std::string Str(T x) { return absl::StrCat(x); } template <> -string Str(complex64 x) { +std::string Str(complex64 x) { return absl::StrCat("(", x.real(), ", ", x.imag(), ")"); } @@ -1460,7 +1460,7 @@ absl::Status TensorsAreClose(const Tensor& a, const Tensor& b, double atol, case DT_COMPLEX64: return TensorsAreCloseImpl(a, b, atol, rtol); case DT_INT32: - return TensorsAreEqualImpl(a, b); + return TensorsAreEqualImpl(a, b); case DT_INT64: return TensorsAreEqualImpl(a, b); case DT_BOOL: @@ -1499,9 +1499,10 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( VLOG(1) << "Input: " << input_tensors.back().DebugString(); } - string reference_device = + std::string reference_device = LocalDeviceToFullDeviceName(*tf_xla_reference_device_ptr); - string test_device = LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr); + std::string test_device = + LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr); DeviceNameUtils::ParsedName parsed_name; if (!DeviceNameUtils::ParseLocalName(*tf_xla_test_device_ptr, &parsed_name)) { @@ -1512,8 +1513,8 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( ++num_tests_; GraphDef graph; - std::vector expected_inputs, test_inputs; - std::vector expected_fetches, test_fetches; + std::vector expected_inputs, test_inputs; + std::vector expected_fetches, test_fetches; absl::Status status = builder.BuildGraph( absl::StrCat("test", num_tests_, "_expected"), reference_device, /*use_jit=*/false, &graph, /*test_node_def=*/nullptr, &expected_inputs, @@ -1550,8 +1551,9 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( return kFatalError; } - std::vector> expected_feeds(expected_inputs.size()); - std::vector> test_feeds(test_inputs.size()); + std::vector> expected_feeds( + expected_inputs.size()); + std::vector> test_feeds(test_inputs.size()); CHECK_EQ(input_tensors.size(), expected_inputs.size()); CHECK_EQ(input_tensors.size(), test_inputs.size()); @@ -1707,12 +1709,12 @@ TEST_F(OpTest, ArgMax) { auto type = Choose({DT_BOOL, DT_FLOAT}); std::vector dims = RandomDims(1, 5, 1); int num_dims = dims.size(); - int reduce_dim = - std::uniform_int_distribution(-num_dims, num_dims)(generator()); + int reduce_dim = std::uniform_int_distribution( + -num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMax") .RandomInput(type, dims) - .Input(test::AsScalar(reduce_dim)) + .Input(test::AsScalar(reduce_dim)) .Attr("T", type) .Attr("Tidx", DT_INT32) .Attr("output_type", DT_INT32)); @@ -1724,12 +1726,12 @@ TEST_F(OpTest, ArgMin) { auto type = Choose({DT_BOOL, DT_FLOAT}); std::vector dims = RandomDims(1, 5, 1); int num_dims = dims.size(); - int reduce_dim = - std::uniform_int_distribution(-num_dims, num_dims)(generator()); + int reduce_dim = std::uniform_int_distribution( + -num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMin") .RandomInput(type, dims) - .Input(test::AsScalar(reduce_dim)) + .Input(test::AsScalar(reduce_dim)) .Attr("T", type) .Attr("Tidx", DT_INT32) .Attr("output_type", DT_INT32)); @@ -1786,7 +1788,7 @@ TEST_F(OpTest, AvgPool) { std::uniform_int_distribution(1, dims[2])(generator()); int stride_rows = random_int(generator()), stride_cols = random_int(generator()); - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool") .RandomInput(DT_FLOAT, dims) @@ 
-1817,7 +1819,7 @@ TEST_F(OpTest, AvgPool3D) { int64_t batch = dims[3]; int64_t feature = dims[4]; - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool3D") .RandomInput(DT_FLOAT, @@ -1837,13 +1839,13 @@ TEST_F(OpTest, AvgPoolGrad) { Repeatedly([this]() { int batch = RandomDim(1), features = RandomDim(1); WindowedSpatialDims d = ChooseWindowedSpatialDims(2); - std::vector input_dims = + std::vector input_dims = AsInt32s(ImageDims(FORMAT_NHWC, batch, features, d.input_dims)); std::vector output_dims = ImageDims(FORMAT_NHWC, batch, features, d.output_dims); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPoolGrad") - .Input(test::AsTensor(input_dims)) + .Input(test::AsTensor(input_dims)) .RandomInput(DT_FLOAT, output_dims) .Attr("T", DT_FLOAT) .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, d.kernel_dims)) @@ -1859,13 +1861,13 @@ TEST_F(OpTest, AvgPool3DGrad) { Repeatedly([this]() { int batch = RandomDim(1), features = RandomDim(1); WindowedSpatialDims d = ChooseWindowedSpatialDims(3); - std::vector input_dims = + std::vector input_dims = AsInt32s(ImageDims(FORMAT_NHWC, batch, features, d.input_dims)); std::vector output_dims = ImageDims(FORMAT_NHWC, batch, features, d.output_dims); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("AvgPool3DGrad") - .Input(test::AsTensor(input_dims)) + .Input(test::AsTensor(input_dims)) .RandomInput(DT_FLOAT, output_dims) .Attr("T", DT_FLOAT) .Attr("ksize", ImageDims(FORMAT_NHWC, 1, 1, d.kernel_dims)) @@ -1976,8 +1978,8 @@ TEST_F(OpTest, BatchToSpaceND) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("BatchToSpaceND") .RandomInput(type, input_dims) - .Input(test::AsTensor( - std::vector(block_dims.begin(), block_dims.end()))) + .Input(test::AsTensor( + std::vector(block_dims.begin(), block_dims.end()))) .Input(crops) .Attr("T", type)); }); @@ -2202,15 +2204,15 @@ TEST_F(OpTest, ConcatOffset) { std::vector dims = RandomDims(1); int concat_dim = - std::uniform_int_distribution(0, dims.size() - 1)(generator()); + std::uniform_int_distribution(0, dims.size() - 1)(generator()); OpTestBuilder builder("ConcatOffset"); - builder.Input(test::AsScalar(concat_dim)); + builder.Input(test::AsScalar(concat_dim)); builder.Attr("N", n); for (int i = 0; i < n; ++i) { - std::vector shape(dims.begin(), dims.end()); + std::vector shape(dims.begin(), dims.end()); shape[concat_dim] = RandomDim(); - builder.Input(test::AsTensor(shape)); + builder.Input(test::AsTensor(shape)); } return ExpectTfAndXlaOutputsAreClose(builder); }); @@ -2284,7 +2286,8 @@ TEST_F(OpTest, IFFT3D) { TEST_F(OpTest, RFFT) { Repeatedly([this]() { std::vector dims = RandomDims(1, kDefaultMaxRank, 3); - Tensor fft_shape = test::AsTensor(AsInt32s({dims[dims.size() - 1]})); + Tensor fft_shape = + test::AsTensor(AsInt32s({dims[dims.size() - 1]})); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("RFFT").RandomInput(DT_FLOAT, dims).Input(fft_shape)); }); @@ -2293,7 +2296,7 @@ TEST_F(OpTest, RFFT) { TEST_F(OpTest, RFFT2D) { Repeatedly([this]() { std::vector dims = RandomDims(2, kDefaultMaxRank, 3); - Tensor fft_shape = test::AsTensor( + Tensor fft_shape = test::AsTensor( AsInt32s({dims[dims.size() - 2], dims[dims.size() - 1]})); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("RFFT2D").RandomInput(DT_FLOAT, dims).Input(fft_shape)); @@ -2303,7 +2306,7 @@ TEST_F(OpTest, RFFT2D) { TEST_F(OpTest, RFFT3D) { Repeatedly([this]() { std::vector dims = RandomDims(3, kDefaultMaxRank, 3); - Tensor 
fft_shape = test::AsTensor(AsInt32s( + Tensor fft_shape = test::AsTensor(AsInt32s( {dims[dims.size() - 3], dims[dims.size() - 2], dims[dims.size() - 1]})); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("RFFT3D").RandomInput(DT_FLOAT, dims).Input(fft_shape)); @@ -2315,7 +2318,7 @@ TEST_F(OpTest, IRFFT) { std::vector dims = RandomDims(1, kDefaultMaxRank, 3); int64_t orig_size = dims[dims.size() - 1]; dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1; - Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); + Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT") .RandomInput(DT_COMPLEX64, dims) .Input(fft_shape)); @@ -2328,7 +2331,7 @@ TEST_F(OpTest, IRFFT2D) { std::vector orig_size = {dims[dims.size() - 2], dims[dims.size() - 1]}; dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1; - Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); + Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT2D") .RandomInput(DT_COMPLEX64, dims) .Input(fft_shape)); @@ -2341,7 +2344,7 @@ TEST_F(OpTest, IRFFT3D) { std::vector orig_size = { dims[dims.size() - 3], dims[dims.size() - 2], dims[dims.size() - 1]}; dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1; - Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); + Tensor fft_shape = test::AsTensor(AsInt32s({orig_size})); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT3D") .RandomInput(DT_COMPLEX64, dims) .Input(fft_shape)); @@ -2387,7 +2390,7 @@ TEST_F(OpTest, Conv2DBackpropFilter) { ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); - Tensor kernel_shape = test::AsTensor(AsInt32s( + Tensor kernel_shape = test::AsTensor(AsInt32s( {d.kernel_dims[0], d.kernel_dims[1], features_in, features_out})); DataType type = DT_FLOAT; return ExpectTfAndXlaOutputsAreClose( @@ -2409,7 +2412,7 @@ TEST_F(OpTest, Conv2DBackpropInput) { int features_in = random_int(generator()); int features_out = random_int(generator()); int32_t batch = RandomDim(); - Tensor in_shape = test::AsTensor( + Tensor in_shape = test::AsTensor( AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); @@ -2465,7 +2468,7 @@ TEST_F(OpTest, Conv3DBackpropFilter) { ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); - Tensor kernel_shape = test::AsTensor( + Tensor kernel_shape = test::AsTensor( AsInt32s({d.kernel_dims[0], d.kernel_dims[1], d.kernel_dims[2], features_in, features_out})); DataType type = DT_FLOAT; @@ -2489,7 +2492,7 @@ TEST_F(OpTest, Conv3DBackpropInput) { int features_in = random_int(generator()); int features_out = random_int(generator()); int32_t batch = RandomDim(1); - Tensor in_shape = test::AsTensor( + Tensor in_shape = test::AsTensor( AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); std::vector backprop = ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims); @@ -2587,7 +2590,7 @@ TEST_F(OpTest, DepthwiseConv2DNativeBackpropFilter) { ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims); std::vector backprop = ImageDims( FORMAT_NHWC, batch, features_in * depth_multiplier, d.output_dims); - Tensor kernel_shape = test::AsTensor(AsInt32s( + Tensor kernel_shape = test::AsTensor(AsInt32s( {d.kernel_dims[0], d.kernel_dims[1], 
features_in, depth_multiplier})); std::vector strides = ImageDims(FORMAT_NHWC, 1, 1, d.stride_dims); strides[2] = strides[1]; // Current impl only supports equal strides @@ -2612,7 +2615,7 @@ TEST_F(OpTest, DepthwiseConv2DBackpropInput) { int features_in = random_int(generator()); int depth_multiplier = random_int(generator()); int32_t batch = RandomDim(); - Tensor in_shape = test::AsTensor( + Tensor in_shape = test::AsTensor( AsInt32s(ImageDims(FORMAT_NHWC, batch, features_in, d.input_dims))); std::vector backprop = ImageDims( FORMAT_NHWC, batch, features_in * depth_multiplier, d.output_dims); @@ -2717,15 +2720,15 @@ TEST_F(OpTest, DynamicStitch) { // implementation does so require. However, the native TF implementation // leaves undefined values if we don't cover everything, so we can't // really test that case anyway. - std::vector indices(size); + std::vector indices(size); std::iota(indices.begin(), indices.end(), 0); std::shuffle(indices.begin(), indices.end(), generator()); int pos = 0; for (int i = 0; i < n; ++i) { TensorShape shape(index_dims[i]); - Tensor t = test::AsTensor( - absl::Span(indices).subspan(pos, shape.num_elements()), + Tensor t = test::AsTensor( + absl::Span(indices).subspan(pos, shape.num_elements()), shape); builder.Input(t); pos += t.NumElements(); @@ -2785,8 +2788,8 @@ TEST_F(OpTest, EluGrad) { TEST_F(OpTest, ScatterNd) { Repeatedly([this]() { auto a = ChooseScatterArguments(); - auto shape = test::AsTensor( - std::vector(a.shape.begin(), a.shape.end())); + auto shape = test::AsTensor( + std::vector(a.shape.begin(), a.shape.end())); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ScatterNd") .Input(a.indices) .Input(a.updates) @@ -2859,8 +2862,9 @@ TEST_F(OpTest, ExpandDims) { auto type = Choose(kAllXlaTypes); std::vector in_dims = RandomDims(); Tensor dim(DT_INT32, TensorShape()); - std::uniform_int_distribution d(-1 - in_dims.size(), in_dims.size()); - dim.scalar()() = d(generator()); + std::uniform_int_distribution d(-1 - in_dims.size(), + in_dims.size()); + dim.scalar()() = d(generator()); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ExpandDims") .RandomInput(type, in_dims) .Input(dim) @@ -2872,10 +2876,10 @@ TEST_F(OpTest, Fill) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector dims = RandomDims(); - std::vector shape(dims.begin(), dims.end()); + std::vector shape(dims.begin(), dims.end()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Fill") - .Input(test::AsTensor(shape)) + .Input(test::AsTensor(shape)) .RandomInput(type, {}) .Attr("T", type)); }); @@ -2953,9 +2957,9 @@ TEST_F(OpTest, GatherNd) { std::vector output_shape(output_outer_shape); output_shape.push_back(index_len); Tensor lo(indices_type, TensorShape(output_shape)); - test::FillFn(&lo, [](int i) -> int32 { return 0; }); + test::FillFn(&lo, [](int i) -> int32_t { return 0; }); Tensor hi(indices_type, TensorShape(output_shape)); - test::FillFn(&hi, [index_len, &params_shape](int i) -> int32 { + test::FillFn(&hi, [index_len, &params_shape](int i) -> int32_t { int idx_dim = i % index_len; return params_shape[idx_dim] - 1; }); @@ -3020,7 +3024,7 @@ TEST_F(OpTest, InplaceUpdate) { x_dims.insert(x_dims.end(), common_dims.begin(), common_dims.end()); std::vector i_shape{v_dims[0]}; Tensor i = - RandomBoundedTensor(DT_INT32, 0, x_dims[0] - 1, true, i_shape); + RandomBoundedTensor(DT_INT32, 0, x_dims[0] - 1, true, i_shape); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("InplaceUpdate") .RandomInput(type, x_dims) .Input(i) @@ -3050,7 +3054,7 @@ TEST_F(OpTest,
InvertPermutation) { // TODO(b/211012712): Once needs_unique_values case is linear instead of // quadratic time, use default Dim max instead of 8. int64_t len = RandomDim(0, 8); - Tensor x = RandomBoundedTensor(DT_INT32, 0, len - 1, true, {len}); + Tensor x = RandomBoundedTensor(DT_INT32, 0, len - 1, true, {len}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("InvertPermutation").Input(x).Attr("T", DT_INT32)); }); @@ -3155,7 +3159,7 @@ TEST_F(OpTest, Lgamma) { TEST_F(OpTest, LinSpace) { Repeatedly([this]() { auto ToScalar = [](DataType type, int x) { - if (type == DT_INT32) return test::AsScalar(x); + if (type == DT_INT32) return test::AsScalar(x); return test::AsScalar(x); }; std::uniform_int_distribution distribution(-50, 50); @@ -3294,11 +3298,11 @@ TEST_F(OpTest, MatrixBandPart) { auto type = Choose(kAllXlaTypes); auto index_type = Choose({DT_INT32, DT_INT64}); auto num_lower = - RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, - 2 * kDefaultMaxDimensionSize, false, {}); + RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, + 2 * kDefaultMaxDimensionSize, false, {}); auto num_upper = - RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, - 2 * kDefaultMaxDimensionSize, false, {}); + RandomBoundedTensor(index_type, -2 * kDefaultMaxDimensionSize, + 2 * kDefaultMaxDimensionSize, false, {}); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixBandPart") .RandomInput(type) .Input(num_lower) @@ -3334,12 +3338,12 @@ TEST_F(OpTest, MatrixDiagPartV3) { auto type = Choose(kAllXlaTypes); auto align = Choose( {"LEFT_RIGHT", "RIGHT_LEFT", "LEFT_LEFT", "RIGHT_RIGHT"}); - auto k0 = std::uniform_int_distribution( + auto k0 = std::uniform_int_distribution( -2 * kDefaultMaxDimensionSize, 2 * kDefaultMaxDimensionSize)(generator()); - auto k1 = std::uniform_int_distribution( + auto k1 = std::uniform_int_distribution( k0, 2 * kDefaultMaxDimensionSize)(generator()); - auto k = test::AsTensor({k0, k1}); + auto k = test::AsTensor({k0, k1}); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("MatrixDiagPartV3") .RandomInput(type) .Input(k) @@ -3373,10 +3377,10 @@ TEST_F(OpTest, MatrixSetDiagV2) { int64_t max_num_diags = shape[rank - 2] + shape[rank - 1] - 1; int64_t num_diags = std::uniform_int_distribution(2, max_num_diags)(generator()); - int32 k0 = std::uniform_int_distribution( + int32_t k0 = std::uniform_int_distribution( -shape[rank - 2] + 1, shape[rank - 1] - num_diags)(generator()); - int32 k1 = k0 + num_diags - 1; - Tensor k = test::AsTensor({k0, k1}); + int32_t k1 = k0 + num_diags - 1; + Tensor k = test::AsTensor({k0, k1}); int64_t max_diag_len = std::min(shape[rank - 2] + std::min(k1, 0), shape[rank - 1] + std::min(-k0, 0)); std::vector diagonal_shape(shape); @@ -3428,7 +3432,7 @@ TEST_F(OpTest, MaxPool) { int stride_rows = random_int(generator()), stride_cols = random_int(generator()); - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("MaxPool") .RandomInput(DT_FLOAT, dims) @@ -3462,7 +3466,7 @@ TEST_F(OpTest, MaxPool3D) { int64_t batch = dims[3]; int64_t feature = dims[4]; - string padding = Choose({"SAME", "VALID"}); + std::string padding = Choose({"SAME", "VALID"}); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("MaxPool3D") .RandomInput(DT_FLOAT, @@ -3589,20 +3593,20 @@ TEST_F(OpTest, OneHot) { int32_t depth = RandomDim(); Tensor indices(DT_INT32, TensorShape(dims)); - std::uniform_int_distribution distribution(-depth * 2, depth * 2); - 
test::FillFn(&indices, [this, &distribution](int i) -> int32 { + std::uniform_int_distribution distribution(-depth * 2, depth * 2); + test::FillFn(&indices, [this, &distribution](int i) -> int32_t { return distribution(generator()); }); - int axis = std::uniform_int_distribution(-num_dims - 5, - num_dims + 5)(generator()); + int axis = std::uniform_int_distribution( + -num_dims - 5, num_dims + 5)(generator()); OpTestBuilder builder("OneHot"); builder.Attr("T", type); builder.Attr("TI", DT_INT32); builder.Attr("axis", axis); builder.Input(indices); - builder.Input(test::AsScalar(depth)); + builder.Input(test::AsScalar(depth)); builder.RandomInput(type, {}); builder.RandomInput(type, {}); return ExpectTfAndXlaOutputsAreClose(builder); @@ -3625,8 +3629,8 @@ TEST_F(OpTest, Pack) { std::vector dims = RandomDims(); int num_dims = dims.size(); - int axis = std::uniform_int_distribution(-num_dims - 1, - num_dims)(generator()); + int axis = std::uniform_int_distribution(-num_dims - 1, + num_dims)(generator()); OpTestBuilder builder("Pack"); builder.Attr("T", type); @@ -3768,7 +3772,7 @@ TEST_F(OpTest, RandomUniform) { TEST_F(OpTest, Range) { Repeatedly([this]() { auto ToScalar = [](DataType type, int x) { - if (type == DT_INT32) return test::AsScalar(x); + if (type == DT_INT32) return test::AsScalar(x); if (type == DT_INT64) return test::AsScalar(x); if (type == DT_FLOAT) return test::AsScalar(x); if (type == DT_DOUBLE) return test::AsScalar(x); @@ -3885,8 +3889,8 @@ TEST_F(OpTest, Reshape) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Reshape") .RandomInput(type, dims_before) - .Input(test::AsTensor( - std::vector(dims_after.begin(), dims_after.end()))) + .Input(test::AsTensor( + std::vector(dims_after.begin(), dims_after.end()))) .Attr("T", type)); }); } @@ -3912,8 +3916,8 @@ TEST_F(OpTest, ResizeBilinear) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ResizeBilinear") .RandomInput(DT_FLOAT, in_dims) - .Input(test::AsTensor( - std::vector(out_dims.begin(), out_dims.end()))) + .Input(test::AsTensor( + std::vector(out_dims.begin(), out_dims.end()))) .Attr("T", DT_FLOAT) .Attr("align_corners", true)); }); @@ -3965,14 +3969,14 @@ TEST_F(OpTest, ReverseSequence) { int batch_size = dims[batch_dim]; int max_seq_len = dims[seq_dim]; - std::vector seq_lens(batch_size); - std::uniform_int_distribution d(0, max_seq_len); + std::vector seq_lens(batch_size); + std::uniform_int_distribution d(0, max_seq_len); absl::c_generate(seq_lens, [&]() { return d(generator()); }); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ReverseSequence") .RandomInput(type, dims) - .Input(test::AsTensor(seq_lens)) + .Input(test::AsTensor(seq_lens)) .Attr("seq_dim", seq_dim) .Attr("batch_dim", batch_dim) .Attr("T", type) @@ -4161,14 +4165,15 @@ TEST_F(OpTest, Size) { TEST_F(OpTest, Slice) { Repeatedly([this]() { SliceArguments a = ChooseSliceArguments(true); - std::vector size; + std::vector size; size.insert(size.end(), a.size.begin(), a.size.end()); - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Slice") - .RandomInput(a.type, a.shape) - .Input(a.indices) - .Input(test::AsTensor(size)) - .Attr("T", a.type) - .Attr("Index", a.indices_type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Slice") + .RandomInput(a.type, a.shape) + .Input(a.indices) + .Input(test::AsTensor(size)) + .Attr("T", a.type) + .Attr("Index", a.indices_type)); }); } @@ -4302,8 +4307,8 @@ TEST_F(OpTest, SpaceToBatchND) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("SpaceToBatchND") .RandomInput(type, input_dims) - 
.Input(test::AsTensor( - std::vector(block_dims.begin(), block_dims.end()))) + .Input(test::AsTensor( + std::vector(block_dims.begin(), block_dims.end()))) .Input(paddings) .Attr("T", type)); }); @@ -4360,16 +4365,16 @@ TEST_F(OpTest, SparseSoftmaxCrossEntropyWithLogits) { int64_t batch_size = dims[0]; int64_t num_classes = dims[1]; - std::vector indices(batch_size); + std::vector indices(batch_size); for (int64_t i = 0; i < batch_size; ++i) { - indices[i] = - std::uniform_int_distribution(0, num_classes - 1)(generator()); + indices[i] = std::uniform_int_distribution( + 0, num_classes - 1)(generator()); } return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("SparseSoftmaxCrossEntropyWithLogits") .RandomInput(DT_FLOAT, dims) - .Input(test::AsTensor(indices)) + .Input(test::AsTensor(indices)) .Attr("T", DT_FLOAT) .Attr("Tlabels", DT_INT32)); }); @@ -4383,18 +4388,19 @@ TEST_F(OpTest, Split) { auto type = Choose(kAllXlaTypes); std::vector dims = RandomDims(1); std::uniform_int_distribution ud; - int32_t dim = std::uniform_int_distribution( - -static_cast(dims.size()), - static_cast(dims.size()) - 1)(generator()); + int32_t dim = std::uniform_int_distribution( + -static_cast(dims.size()), + static_cast(dims.size()) - 1)(generator()); int n = std::uniform_int_distribution(1, 5)(generator()); // Ensure 'dim' is evenly divisible by 'n'. dims[dim] /= n; dims[dim] *= n; - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Split") - .Input(test::AsScalar(dim)) - .RandomInput(type, dims) - .Attr("T", type) - .Attr("num_split", n)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Split") + .Input(test::AsScalar(dim)) + .RandomInput(type, dims) + .Attr("T", type) + .Attr("num_split", n)); }); } @@ -4405,12 +4411,12 @@ TEST_F(OpTest, SplitV) { Repeatedly([this]() { // NOLINT: due to GTEST_SKIP auto type = Choose(kAllXlaTypes); std::vector dims = RandomDims(1, kDefaultMaxRank, 1); - int32_t dim = std::uniform_int_distribution( - -static_cast(dims.size()), - static_cast(dims.size()) - 1)(generator()); + int32_t dim = std::uniform_int_distribution( + -static_cast(dims.size()), + static_cast(dims.size()) - 1)(generator()); int n = std::uniform_int_distribution( 1, std::min(5, static_cast(dims[dim])))(generator()); - std::vector size_splits(n); + std::vector size_splits(n); for (int i = 0; i < n - 1; ++i) { size_splits.push_back(dims[dim] / n); } @@ -4418,8 +4424,8 @@ TEST_F(OpTest, SplitV) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("SplitV") .RandomInput(type, dims) - .Input(test::AsTensor(size_splits)) - .Input(test::AsScalar(dim)) + .Input(test::AsTensor(size_splits)) + .Input(test::AsScalar(dim)) .Attr("T", type) .Attr("num_split", n) .Attr("Tlen", DT_INT32)); @@ -4519,12 +4525,12 @@ TEST_F(OpTest, StridedSlice) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector data_dims = RandomDims(); - std::vector begin(data_dims.size()), end(data_dims.size()); - std::vector strides(data_dims.size()); + std::vector begin(data_dims.size()), end(data_dims.size()); + std::vector strides(data_dims.size()); for (int i = 0; i < data_dims.size(); ++i) { - begin[i] = std::uniform_int_distribution( + begin[i] = std::uniform_int_distribution( -2 * data_dims[i], 2 * data_dims[i])(generator()); - end[i] = std::uniform_int_distribution( + end[i] = std::uniform_int_distribution( -2 * data_dims[i], 2 * data_dims[i])(generator()); // TODO(b/31360685): support strides other than 1 or -1 strides[i] = std::bernoulli_distribution()(generator()) ? 
1 : -1; @@ -4547,9 +4553,9 @@ TEST_F(OpTest, StridedSlice) { return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("StridedSlice") .RandomInput(type, data_dims) - .Input(test::AsTensor(begin)) - .Input(test::AsTensor(end)) - .Input(test::AsTensor(strides)) + .Input(test::AsTensor(begin)) + .Input(test::AsTensor(end)) + .Input(test::AsTensor(strides)) .Attr("T", type) .Attr("Index", DT_INT32) .Attr("begin_mask", begin_mask) @@ -4660,14 +4666,14 @@ TEST_F(OpTest, Tile) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector t_dims = RandomDims(1); - std::vector multiples(t_dims.size()); + std::vector multiples(t_dims.size()); for (int i = 0; i < t_dims.size(); ++i) { multiples[i] = std::uniform_int_distribution(1, 3)(generator()); } return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("Tile") .RandomInput(type, t_dims) - .Input(test::AsTensor(multiples)) + .Input(test::AsTensor(multiples)) .Attr("T", type)); }); } @@ -4678,10 +4684,11 @@ TEST_F(OpTest, TopKV2) { Repeatedly([this]() { // NOLINT: due to GTEST_SKIP auto type = Choose({DT_INT32, DT_FLOAT, DT_INT64}); auto shape = RandomDims(1); - int32 k = std::uniform_int_distribution(1, shape[0])(generator()); + int32_t k = + std::uniform_int_distribution(1, shape[0])(generator()); return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("TopKV2") .RandomInput(type, shape) - .Input(test::AsScalar(k)) + .Input(test::AsScalar(k)) .Attr("sorted", RandomBool()) .Attr("T", type)); }); @@ -4691,13 +4698,14 @@ TEST_F(OpTest, Transpose) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector data_dims = RandomDims(); - std::vector perm(data_dims.size()); + std::vector perm(data_dims.size()); std::iota(perm.begin(), perm.end(), 0); std::shuffle(perm.begin(), perm.end(), generator()); - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Transpose") - .RandomInput(type, data_dims) - .Input(test::AsTensor(perm)) - .Attr("T", type)); + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Transpose") + .RandomInput(type, data_dims) + .Input(test::AsTensor(perm)) + .Attr("T", type)); }); } @@ -4887,8 +4895,8 @@ TEST_F(OpTest, FusedBatchNormTraining) { } // namespace tensorflow int main(int argc, char** argv) { - tensorflow::tf_xla_test_device_ptr = new tensorflow::string("GPU:0"); - tensorflow::tf_xla_reference_device_ptr = new tensorflow::string("CPU:0"); + tensorflow::tf_xla_test_device_ptr = new std::string("GPU:0"); + tensorflow::tf_xla_reference_device_ptr = new std::string("CPU:0"); std::vector flag_list = { tensorflow::Flag( "tf_xla_random_seed", &tensorflow::tf_xla_random_seed, @@ -4913,7 +4921,7 @@ int main(int argc, char** argv) { "tf_xla_test_use_mlir", &tensorflow::tf_xla_test_use_mlir, "Use MLIR legalization kernels for the operator under test"), }; - tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list); + std::string usage = tensorflow::Flags::Usage(argv[0], flag_list); const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_result) { LOG(ERROR) << "\n" << usage; diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py index 5b41a8108573ac..938277324f1de6 100644 --- a/tensorflow/compiler/tests/scatter_nd_op_test.py +++ b/tensorflow/compiler/tests/scatter_nd_op_test.py @@ -149,8 +149,6 @@ def testSimple3(self): expected = np.array([[0., 0.], [11., 12.], [0., 0.]]) self.assertAllEqual(expected, self._runScatterNd(indices, updates, [3, 2])) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test 
fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testVariableRankUpdate(self): self._VariableRankTests(_NumpyUpdate, self._runScatterNd) diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py b/tensorflow/compiler/tests/segment_reduction_ops_test.py index 61b9b2c25f0291..36a1fe43db8109 100644 --- a/tensorflow/compiler/tests/segment_reduction_ops_test.py +++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py @@ -78,8 +78,6 @@ def _unsortedSegmentMax(self, data, indices, num_segments): return self._segmentReduction(math_ops.unsorted_segment_max, data, indices, num_segments) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testSegmentSum(self): for dtype in self.numeric_types: self.assertAllClose( @@ -88,8 +86,6 @@ def testSegmentSum(self): np.array([0, 1, 2, 3, 4, 5], dtype=dtype), np.array([0, 0, 2, 3, 3, 3], dtype=np.int32), 4)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 24-11-05 def testSegmentProd(self): for dtype in self.numeric_types: self.assertAllClose( @@ -98,8 +94,6 @@ def testSegmentProd(self): np.array([0, 1, 2, 3, 4, 5], dtype=dtype), np.array([0, 0, 2, 3, 3, 3], dtype=np.int32), 4)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 24-11-05 def testSegmentProdNumSegmentsLess(self): for dtype in self.numeric_types: self.assertAllClose( @@ -108,8 +102,6 @@ def testSegmentProdNumSegmentsLess(self): np.array([0, 1, 2, 3, 4, 5], dtype=dtype), np.array([0, 0, 2, 3, 3, 3], dtype=np.int32), 3)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 24-11-05 def testSegmentProdNumSegmentsMore(self): for dtype in self.numeric_types: self.assertAllClose( @@ -194,8 +186,6 @@ def testUnsortedSegmentSum0DIndices1DData(self): self._unsortedSegmentSum( np.array([0, 1, 2, 3, 4, 5], dtype=dtype), 2, 4)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testUnsortedSegmentSum1DIndices1DData(self): for dtype in self.numeric_types: self.assertAllClose( @@ -204,8 +194,6 @@ def testUnsortedSegmentSum1DIndices1DData(self): np.array([0, 1, 2, 3, 4, 5], dtype=dtype), np.array([3, 0, 2, 1, 3, 3], dtype=np.int32), 4)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testUnsortedSegmentSum1DIndices1DDataNegativeIndices(self): for dtype in self.numeric_types: self.assertAllClose( @@ -214,8 +202,6 @@ def testUnsortedSegmentSum1DIndices1DDataNegativeIndices(self): np.array([0, 1, 2, 3, 4, 5, 6], dtype=dtype), np.array([3, -1, 0, 1, 0, -1, 3], dtype=np.int32), 4)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testUnsortedSegmentSum1DIndices2DDataDisjoint(self): for dtype in self.numeric_types: data = np.array( @@ -232,8 +218,6 @@ def testUnsortedSegmentSum1DIndices2DDataDisjoint(self): [50, 51, 52, 53], [0, 1, 2, 3], [0, 0, 0, 0]], dtype=dtype), y) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testUnsortedSegmentSum1DIndices2DDataNonDisjoint(self): for dtype in self.numeric_types: data = np.array( @@ -249,8 +233,6 @@ 
def testUnsortedSegmentSum1DIndices2DDataNonDisjoint(self): [0, 0, 0, 0]], dtype=dtype), y) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testUnsortedSegmentSum2DIndices3DData(self): for dtype in self.numeric_types: data = np.array( @@ -268,8 +250,6 @@ def testUnsortedSegmentSum2DIndices3DData(self): ], [0, 0, 0.], [90, 92, 94], [103, 104, 105], [0, 0, 0]], dtype=dtype), y) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 25-05-14 def testUnsortedSegmentSum1DIndices3DData(self): for dtype in self.numeric_types: data = np.array( @@ -298,8 +278,6 @@ def testUnsortedSegmentSumShapeError(self): math_ops.unsorted_segment_sum, data, indices, num_segments)) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 24-11-05 def testUnsortedSegmentOps1DIndices1DDataNegativeIndices(self): """Tests for min, max, and prod ops. diff --git a/tensorflow/compiler/tests/unary_ops_composition_test.cc b/tensorflow/compiler/tests/unary_ops_composition_test.cc index 641af606bb24d1..c27b8070bbb450 100644 --- a/tensorflow/compiler/tests/unary_ops_composition_test.cc +++ b/tensorflow/compiler/tests/unary_ops_composition_test.cc @@ -48,9 +48,9 @@ static bool Initialized = [] { class UnaryOpsCompositionTest : public OpsTestBase { protected: template - void RunComposedOp(const std::vector op_names, T input_scalar_value, - T expected_scalar_value) { - string xla_device_name = + void RunComposedOp(const std::vector op_names, + T input_scalar_value, T expected_scalar_value) { + std::string xla_device_name = tensorflow::IsGoogleCudaEnabled() ? DEVICE_XLA_GPU : DEVICE_XLA_CPU; SetDevice(DeviceType(xla_device_name), std::unique_ptr(DeviceFactory::NewDevice( diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 7e9069dacfbaca..037560a142998d 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -215,8 +215,6 @@ def testCos(self): math_ops.cos, x, expected=np.cos(x), rtol=tol, atol=1e-5 ) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="Test fails on ROCm.") #TODO(rocm): weekly sync 24-11-05 def testSigmoidNumericalStability(self): for dtype in self.float_types: if dtype != np.float16: diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index 20f93d86adfad1..d642418a44c2f5 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -308,7 +308,8 @@ def device_scope(self): yield def assert_op_output_matches_expected( - self, op, inp, expected, equality_test=None, rtol=1e-3, atol=1e-5 + self, op, inp, expected, local_session, + equality_test=None, rtol=1e-3, atol=1e-5 ): """Verifies that 'op' produces 'expected' when fed input 'inp' . @@ -316,25 +317,25 @@ def assert_op_output_matches_expected( op: operator to test inp: numpy input array to use as input to 'op'. expected: numpy array representing the expected output of 'op'. + local_session: The session to use for the test. equality_test: either None, or a function that tests two numpy arrays for equality. If None, self.assertAllClose is used. rtol: relative tolerance for equality test. atol: absolute tolerance for equality test. 
""" - with self.session() as local_session: - with self.test_scope(): - pinp = array_ops.placeholder( - dtypes.as_dtype(inp.dtype), inp.shape, name='a' - ) - output = op(pinp) - result = local_session.run(output, {pinp: inp}) - if equality_test is None: - self.assertEqual(output.dtype, expected.dtype) - self.assertAllCloseAccordingToType( - expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03 - ) - else: - equality_test(result, expected, rtol=rtol, atol=atol) + with self.test_scope(): + pinp = array_ops.placeholder( + dtypes.as_dtype(inp.dtype), inp.shape, name='a' + ) + output = op(pinp) + result = local_session.run(output, {pinp: inp}) + if equality_test is None: + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03 + ) + else: + equality_test(result, expected, rtol=rtol, atol=atol) def test_scope(self): """Deprecated alias of `device_scope`. diff --git a/tensorflow/compiler/tf2tensorrt/common/datavec.h b/tensorflow/compiler/tf2tensorrt/common/datavec.h index eff32f1f521af4..34b419d1d20d62 100644 --- a/tensorflow/compiler/tf2tensorrt/common/datavec.h +++ b/tensorflow/compiler/tf2tensorrt/common/datavec.h @@ -27,7 +27,7 @@ namespace tensorrt { // Input/output data format for OpConverterTest::BuildAndRun(). struct InputOutputData { size_t TotalBytes() const { return tensor.TotalBytes(); } - string name; + std::string name; Tensor tensor; }; diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc index c8eb3db2e0b9e4..b4c3052953c677 100755 --- a/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/einsum.cc @@ -739,16 +739,16 @@ class ReIndexer { // Initializes the index map with existing lowercase labels. ReIndexer(std::string eq) { for (char c : eq) { - if (islower(c)) { + if (absl::ascii_islower(c)) { idx_map_[c] = c; } } } // Finds new character for uppercase character c. char operator()(char c) { - if (!std::isupper(c)) return c; + if (!absl::ascii_isupper(c)) return c; if (idx_map_.count(c) > 0) return idx_map_[c]; - char new_idx = std::tolower(c); + char new_idx = absl::ascii_tolower(c); // If lower(c) is not used in the equation, use it to replace c. 
if (idx_map_.count(new_idx) == 0) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc index faedcf3de8c427..000c32df25d253 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_parameters.cc @@ -81,9 +81,7 @@ string ProfileStrategyToName(const ProfileStrategy strategy) { } Status ProfileStrategyFromName(const string& name, ProfileStrategy* strategy) { - string name_lowercase(name); - std::transform(name.begin(), name.end(), name_lowercase.begin(), - [](unsigned char c) { return std::tolower(c); }); + std::string name_lowercase = absl::AsciiStrToLower(name); if (name_lowercase == "range") { *strategy = ProfileStrategy::kRange; } else if (name_lowercase == "optimal") { diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 30aff91a76d3b1..d1bf00a53d1cc3 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -99,7 +99,7 @@ string TRTEngineCacheResource::DebugString() const { EngineContext* TRTEngineCacheResource::GetEngineContext( const std::vector& input_shapes) { EngineContext* engine_context = nullptr; - int64 min_matched_batch_size = kint64max; + int64 min_matched_batch_size = std::numeric_limits::max(); for (const auto& pair : cache_) { const std::vector& cached_input_shapes = pair.first; // This should not happen, but just for safety. diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 254a1e85c35192..e5545445817ec2 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -138,6 +138,25 @@ cc_library( ], ) +cc_library( + name = "encoded_buffer_allocation_info", + hdrs = ["encoded_buffer_allocation_info.h"], + visibility = [":friends"], + deps = [ + "@local_xla//xla/backends/cpu:buffer_allocation_info", + ], +) + +tf_cc_test( + name = "encoded_buffer_allocation_info_test", + srcs = ["encoded_buffer_allocation_info_test.cc"], + deps = [ + ":encoded_buffer_allocation_info", + "@com_google_googletest//:gtest_main", + "@local_xla//xla/backends/cpu:buffer_allocation_info", + ], +) + cc_library( name = "tf2xla", srcs = ["tf2xla.cc"], @@ -218,6 +237,7 @@ filegroup( name = "xla_compiled_cpu_runtime_hdrs", srcs = [ "allocator.h", + "encoded_buffer_allocation_info.h", "xla_compiled_cpu_function.h", "//tensorflow/core/kernels:xla_cpu_runtime_hdrs", "//tensorflow/core/platform:xla_cpu_runtime_srcs", @@ -355,6 +375,7 @@ cc_library( # "@local_tsl//tsl/platform:context", # "@local_tsl//tsl/platform:cord", # "@local_tsl//tsl/platform:env_time", +# "@local_tsl//tsl/platform:refcount", # "@local_tsl//tsl/platform:ml_dtypes", # "@local_tsl//tsl/platform:logging", # "@local_tsl//tsl/platform:macros", @@ -437,6 +458,7 @@ cc_library( ":allocator", "@com_google_absl//absl/log:check", "@com_google_absl//absl/types:span", + ":encoded_buffer_allocation_info", "@local_xla//xla/service:custom_call_status_internal", "@local_xla//xla/backends/cpu/runtime:rng_state_lib", "@local_xla//xla/backends/cpu:alignment", @@ -502,6 +524,7 @@ cc_library( hdrs = ["xla_jit_compiled_cpu_function.h"], visibility = ["//visibility:public"], deps = [ + ":encoded_buffer_allocation_info", ":tf2xla", ":tf2xla_proto_cc", ":xla_compiled_cpu_function", diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc index 
c7c8702b49b774..d9f6927c09ecd6 100644 --- a/tensorflow/compiler/tf2xla/const_analysis_test.cc +++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc @@ -180,7 +180,7 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_0) { // not need to be a constant. Output reshape = ops::Reshape(root, arg1, add); reshape.node()->AddAttr(kXlaCompileTimeConstantInputsAttr, - std::vector<string>()); + std::vector<std::string>()); Graph graph(OpRegistry::Global()); TF_ASSERT_OK(root.ToGraph(&graph)); @@ -203,7 +203,7 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_1) { // Force const analysis to pretend that the first argument to `add` needs to // be a constant. - std::vector<string> add_constant_inputs; + std::vector<std::string> add_constant_inputs; add_constant_inputs.push_back("x"); add.node()->AddAttr(kXlaCompileTimeConstantInputsAttr, add_constant_inputs); diff --git a/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h new file mode 100644 index 00000000000000..5981751259967a --- /dev/null +++ b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h @@ -0,0 +1,99 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_ENCODED_BUFFER_ALLOCATION_INFO_H_ +#define TENSORFLOW_COMPILER_TF2XLA_ENCODED_BUFFER_ALLOCATION_INFO_H_ + +#include <cstdint> + +#include "xla/backends/cpu/buffer_allocation_info.h" + +namespace xla { +namespace cpu { + +// Encoded version of `BufferAllocationInfo`, which can be used to reconstruct +// the `BufferAllocationInfo` later. It's used in the AOT compiler, to +// represent buffer allocation info as a lightweight struct. +struct EncodedBufferAllocationInfo { + EncodedBufferAllocationInfo(uint64_t packed_kind_and_size, + uint32_t entry_param_number, + uint32_t result_number) + : packed_kind_and_size(packed_kind_and_size), + entry_param_number(entry_param_number), + result_number(result_number) {} + + // Encodes BufferAllocationInfo into the struct that can be used to + // reconstruct the BufferAllocationInfo later using the constructor. We need + // this because we use BufferAllocationInfo in places where using protocol + // buffers would negatively impact binary size. + explicit EncodedBufferAllocationInfo( + const BufferAllocationInfo& buffer_info) { + packed_kind_and_size = Pack(buffer_info.kind(), buffer_info.size()); + entry_param_number = buffer_info.is_entry_parameter() + ? buffer_info.entry_parameter_number() + : -1; + result_number = buffer_info.is_result() ?
buffer_info.result_number() : -1; + } + + explicit operator BufferAllocationInfo() const { + auto kind = UnpackKind(packed_kind_and_size); + auto size = UnpackSize(packed_kind_and_size); + int32_t entry_param_number = static_cast<int32_t>(this->entry_param_number); + int32_t result_number = static_cast<int32_t>(this->result_number); + + switch (kind) { + case BufferAllocationInfo::Kind::kConstant: + return BufferAllocationInfo::Constant(size); + case BufferAllocationInfo::Kind::kTemp: + return BufferAllocationInfo::Temp(size); + case BufferAllocationInfo::Kind::kParameter: + if (entry_param_number >= 0 && result_number >= 0) { + return BufferAllocationInfo::InOutParameter(size, entry_param_number, + result_number); + } + if (entry_param_number >= 0) { + return BufferAllocationInfo::EntryParameter(size, entry_param_number); + } + return BufferAllocationInfo::Result(size, result_number); + case BufferAllocationInfo::Kind::kThreadLocal: + return BufferAllocationInfo::ThreadLocal(size); + } + } + + static uint64_t Pack(BufferAllocationInfo::Kind kind, uint64_t size) { + return (static_cast<uint64_t>(size) << 2) | static_cast<uint64_t>(kind); + } + + static constexpr BufferAllocationInfo::Kind UnpackKind(uint64_t packed) { + return static_cast<BufferAllocationInfo::Kind>((packed << 62) >> 62); + } + + static constexpr uint64_t UnpackSize(uint64_t packed) { return packed >> 2; } + + uint64_t packed_kind_and_size = 0; + uint32_t entry_param_number = -1; + uint32_t result_number = -1; +}; +} // namespace cpu + +// TODO(ezhulenev): This is a temporary hack to keep `tfcompile` code working. +namespace cpu_function_runtime { +using BufferInfo = ::xla::cpu::BufferAllocationInfo; +using EncodedBufferInfo = ::xla::cpu::EncodedBufferAllocationInfo; +} // namespace cpu_function_runtime + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_TF2XLA_ENCODED_BUFFER_ALLOCATION_INFO_H_ diff --git a/third_party/xla/xla/backends/cpu/buffer_allocation_info_test.cc b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info_test.cc similarity index 88% rename from third_party/xla/xla/backends/cpu/buffer_allocation_info_test.cc rename to tensorflow/compiler/tf2xla/encoded_buffer_allocation_info_test.cc index 3848bb6c4db313..c9fc52100abb33 100644 --- a/third_party/xla/xla/backends/cpu/buffer_allocation_info_test.cc +++ b/tensorflow/compiler/tf2xla/encoded_buffer_allocation_info_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2025 The OpenXLA Authors. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License.
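(The packing scheme in the header above keeps the allocation kind in the two low bits of a `uint64_t` and the byte size in the remaining 62. A self-contained round-trip illustration of that bit layout; the enumerator values here are assumed for the demo, since the real `BufferAllocationInfo::Kind` lives in the XLA header.)

```cpp
#include <cstdint>

// Same bit layout as EncodedBufferAllocationInfo: kind in bits [0, 2),
// size in bits [2, 64). Enumerator values are illustrative assumptions.
enum class Kind : uint64_t { kConstant, kTemp, kParameter, kThreadLocal };

constexpr uint64_t Pack(Kind kind, uint64_t size) {
  return (size << 2) | static_cast<uint64_t>(kind);
}
// Shifting left then right clears the 62 size bits, leaving only the kind.
constexpr Kind UnpackKind(uint64_t packed) {
  return static_cast<Kind>((packed << 62) >> 62);
}
constexpr uint64_t UnpackSize(uint64_t packed) { return packed >> 2; }

static_assert(UnpackKind(Pack(Kind::kParameter, 1024)) == Kind::kParameter);
static_assert(UnpackSize(Pack(Kind::kParameter, 1024)) == 1024);
```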
==============================================================================*/ -#include "xla/backends/cpu/buffer_allocation_info.h" +#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h" #include <gtest/gtest.h> +#include "xla/backends/cpu/buffer_allocation_info.h" namespace xla::cpu { namespace { -TEST(BufferAllocationInfoTest, RoundTrip) { +TEST(EncodedBufferAllocationInfoTest, RoundTrip) { auto round_trip = [](const BufferAllocationInfo& buffer_info) { EncodedBufferAllocationInfo encoded(buffer_info); BufferAllocationInfo round_trip(encoded); diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index ba297127eae117..2adc83512c6617 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -83,11 +83,11 @@ struct ClusterTupleLessThan { }; // TODO(jpienaar): Move to OutputTensor. -string DebugString(const OutputTensor& tensor) { +std::string DebugString(const OutputTensor& tensor) { return absl::StrCat(tensor.node->name(), ":", tensor.index); } -string Branch_Name(BranchType b) { +std::string Branch_Name(BranchType b) { switch (b) { case BranchType::kElseBranch: return "else"; @@ -100,13 +100,13 @@ } -string DebugString(StateMap::CondId cond_state) { +std::string DebugString(StateMap::CondId cond_state) { if (cond_state == nullptr || cond_state->empty()) return "{}"; using value_type = StateMap::CondState::value_type; return absl::StrCat( "{", absl::StrJoin(*cond_state, ", ", - [](string* output, const value_type& pred_branch) { + [](std::string* output, const value_type& pred_branch) { const OutputTensor& pred = pred_branch.first; const BranchType& branch = pred_branch.second; if (branch == BranchType::kNeither) @@ -200,7 +200,7 @@ struct CondArgNode { explicit CondArgNode(Node* src, int src_output) : src(src), src_output(src_output) {} - string ToString() const { + std::string ToString() const { return absl::StrCat("src=", src->name(), ":", src_output, " switches=", NodesToString(switches)); } @@ -212,11 +212,11 @@ }; using CondArgNodes = std::vector<CondArgNode>; -string DebugString(const CondArgNodes& nodes) { +std::string DebugString(const CondArgNodes& nodes) { return absl::StrCat( "[", absl::StrJoin(nodes, ", ", - [](string* output, const CondArgNode& node) { + [](std::string* output, const CondArgNode& node) { absl::StrAppend(output, node.ToString()); }), "]"); @@ -263,20 +263,20 @@ void StateMap::ResetAncestorId(const Node* node, StateMap::AncestorId id) { void StateMap::MarkDead(const Node* node) { ResetCondId(node, dead_id_); } -string StateMap::CondStateToString(const Node* node) const { +std::string StateMap::CondStateToString(const Node* node) const { return CondStateToString(LookupCondId(node)); } -string StateMap::CondStateToString(StateMap::CondId id) const { +std::string StateMap::CondStateToString(StateMap::CondId id) const { return DebugString(id); } -string StateMap::AncestorStateToString(const Node* node) const { +std::string StateMap::AncestorStateToString(const Node* node) const { if (auto id = LookupAncestorId(node)) { return absl::StrCat( "{", absl::StrJoin(*id, ",", - [](string* output, const AncestorNode& ancestor) { + [](std::string* output, const AncestorNode& ancestor) { absl::StrAppend(output, ancestor.output_tensor.node->name(), ":", ancestor.output_tensor.index); @@ -340,7 +340,7 @@ class Conditional { // Internal name of conditional. The name is based on the first merge node // added.
- string name() const; + std::string name() const; // The FunctionalizeCond instance that created this. FunctionalizeCond* parent_; @@ -751,7 +751,7 @@ absl::Status Conditional::BuildIfNode(Graph* graph, VLOG(2) << "Build cond function for " << name(); NodeDebugInfo debug_info((*merges_.begin())->def()); NodeDefBuilder builder(name(), "If", library, &debug_info); - const string branch_name[] = {"else_branch", "then_branch"}; + const std::string branch_name[] = {"else_branch", "then_branch"}; for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { int branch_index = static_cast(branch); @@ -817,7 +817,7 @@ absl::Status Conditional::BuildIfNode(Graph* graph, builder.Attr("Tcond", DT_BOOL); // Add some internal attributes which need to be propagated. for (absl::string_view attr_name : kAttrsToPropagate) { - string attr_val; + std::string attr_val; if (GetNodeAttr(predicate_.node->def(), attr_name, &attr_val).ok()) { builder.Attr(attr_name, attr_val); } @@ -949,7 +949,7 @@ absl::Status Conditional::BuildAndReplace( return absl::OkStatus(); } -string Conditional::name() const { +std::string Conditional::name() const { CHECK(!merges_.empty()); return absl::StrCat((*merges_.begin())->name(), "_if"); } @@ -958,7 +958,7 @@ absl::Status FunctionalizeCond::AddIdentityNode(const Node* replacee, Node* if_node, int port) { NodeBuilder id_builder(replacee->name(), "Identity"); id_builder.Input(if_node, port); - string outside_compilation; + std::string outside_compilation; if (GetNodeAttr(if_node->def(), kXlaOutsideCompilationAttr, &outside_compilation) .ok()) { @@ -1580,7 +1580,7 @@ absl::Status FunctionalizeCond::FunctionalizeInternal() { return absl::OkStatus(); } -void FunctionalizeCond::DumpGraphWithCondState(const string& name) { +void FunctionalizeCond::DumpGraphWithCondState(const std::string& name) { const char* const kCondGroupDebugAttr = "_XlaFunctionalizeCondGroup"; for (Node* n : graph_->nodes()) { diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h index e37555b053d7ed..25d773ad50a105 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.h +++ b/tensorflow/compiler/tf2xla/functionalize_cond.h @@ -136,11 +136,11 @@ class StateMap { BranchType FindBranchOf(CondId id, OutputTensor predicate) const; // Returns textual representation of node's CondState. - string CondStateToString(const Node* node) const; - string CondStateToString(CondId id) const; + std::string CondStateToString(const Node* node) const; + std::string CondStateToString(CondId id) const; // Returns textual representation of node's AncestorState. - string AncestorStateToString(const Node* node) const; + std::string AncestorStateToString(const Node* node) const; // Returns whether the cond state is the dead state. bool IsDead(CondId id) const; @@ -201,7 +201,7 @@ class FunctionalizeCond { absl::Status PropagateUpdatedState(const Node* replacee); // Dump graph with the CondState annotated. - void DumpGraphWithCondState(const string& name); + void DumpGraphWithCondState(const std::string& name); // Adds `switch_id` to the list of Switch node ids. 
void AddSwitchId(int switch_id); diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc index 50bd47ad73e77e..edb2a7e0ea1b33 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc @@ -48,7 +48,7 @@ class FunctionalizeCondTest : public ::testing::Test { return fc_->state_map_.GetCondId(state); } - string GetString(const StateMap::StateMap::CondId id) { + std::string GetString(const StateMap::StateMap::CondId id) { return fc_->state_map_.CondStateToString(id); } diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index ac38725269bfd9..22b9b9187ecd7d 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -51,8 +51,9 @@ namespace tensorflow { // Maps function name to // - new function name, if the function body was functionalized // - std::nullopt, if not -using FuncMap = std::map>; -using FuncMapIter = std::map>::const_iterator; +using FuncMap = std::map>; +using FuncMapIter = + std::map>::const_iterator; // Returns whether function has been processed before. bool FunctionHasBeenProcessed(FuncMapIter func_iter, const FuncMap* func_map) { @@ -65,8 +66,8 @@ bool FunctionHasBeenModified(FuncMapIter func_iter) { } // Returns a name for the new functionalized version of a function. -string GetNewFunctionName( - const string& func_name, Node* n, +std::string GetNewFunctionName( + const std::string& func_name, Node* n, AssociatedFunctionInfo::AssociatedFunctionType func_type, FunctionLibraryDefinition* fld) { // For SymbolicGradient, `func_name` is always "SymbolicGradient" which @@ -79,14 +80,15 @@ string GetNewFunctionName( } // Returns name to which a modified function has been mapped. -const string& GetMappedFunctionName(FuncMapIter func_iter) { +const std::string& GetMappedFunctionName(FuncMapIter func_iter) { DCHECK(func_iter->second.has_value()); return func_iter->second.value(); } // Updates `func_map` with function given by `canonicalized_name`. -void UpdateFunctionMap(FuncMap* func_map, const string& canonicalized_name, - const string& new_func_name, bool function_modified) { +void UpdateFunctionMap(FuncMap* func_map, const std::string& canonicalized_name, + const std::string& new_func_name, + bool function_modified) { // If function was modified store its new name, otherwise add empty entry to // record that function has been processed and does not need to be rewritten. (*func_map)[canonicalized_name] = @@ -95,8 +97,9 @@ void UpdateFunctionMap(FuncMap* func_map, const string& canonicalized_name, // Adds new function def to graph's function library if necessary. absl::Status AddFunctionDefToGraphLibrary( - const string& func_name, const AssociatedFunctionInfo& associated_function, - Graph* graph, FunctionLibraryDefinition* fld) { + const std::string& func_name, + const AssociatedFunctionInfo& associated_function, Graph* graph, + FunctionLibraryDefinition* fld) { const OpRegistrationData* op_reg_data; // We have to be careful with adding the function def since there are three // different `OpRegistryInterface`s involved here: @@ -129,8 +132,8 @@ absl::Status AddFunctionDefToGraphLibrary( // Functionalizes function given by `func_name`. Update `func_map` accordingly. 
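(The `FuncMap` introduced above memoizes functionalization results: the presence of an entry means the function was processed, and a populated `std::optional` means its body was also rewritten under a new name. A small sketch of that contract; the helper names here are mine, not from the patch.)

```cpp
#include <map>
#include <optional>
#include <string>

// Canonicalized function name -> new name if the body was rewritten,
// std::nullopt if it was processed but left unchanged.
using FuncMap = std::map<std::string, std::optional<std::string>>;

bool HasBeenProcessed(const FuncMap& m, const std::string& name) {
  return m.find(name) != m.end();
}

bool HasBeenModified(const FuncMap& m, const std::string& name) {
  auto it = m.find(name);
  return it != m.end() && it->second.has_value();
}

void Update(FuncMap& m, const std::string& name, const std::string& new_name,
            bool modified) {
  m[name] = modified ? std::make_optional(new_name) : std::nullopt;
}
```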
absl::Status FunctionalizeControlFlowForFunction( - const string& func_name, const string& new_func_name, - const protobuf::Map& attrs, + const std::string& func_name, const std::string& new_func_name, + const protobuf::Map& attrs, FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, FuncMap* func_map, bool* function_modified, const NodeFilter& node_filter = {}); @@ -165,11 +168,11 @@ absl::Status FunctionalizeControlFlowForNodeAssociatedFunctions( associated_functions.size() == 1); // Process one node-function-pair. - string func_name = associated_function.func_name(); - string canonicalized_name = + std::string func_name = associated_function.func_name(); + std::string canonicalized_name = Canonicalize(func_name, AttrSlice(&associated_function.attrs())); auto func_iter = func_map->find(canonicalized_name); - string new_func_name; + std::string new_func_name; if (FunctionHasBeenProcessed(func_iter, func_map)) { if (FunctionHasBeenModified(func_iter)) { *any_function_modified = true; @@ -202,8 +205,8 @@ absl::Status FunctionalizeControlFlowForNodeAssociatedFunctions( } absl::Status FunctionalizeControlFlowForFunction( - const string& func_name, const string& new_func_name, - const protobuf::Map& attrs, + const std::string& func_name, const std::string& new_func_name, + const protobuf::Map& attrs, FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, FuncMap* func_map, bool* function_modified, const NodeFilter& node_filter) { *function_modified = false; @@ -341,8 +344,8 @@ absl::Status FunctionalizeControlFlowForXlaPass::Run( // Find XLA compile ops and its corresponding FunctionDef. // TPUCompile op is not in the map because graph rewriting might happen // multiple times, and we want to avoid functionalize it again. - static std::map* kNodeTypeToFunctionAttrMapping = - new std::map{ + static std::map* kNodeTypeToFunctionAttrMapping = + new std::map{ // _TPUReplicate ops are generated by EncapsulateTPUComputationsPass. {"_TPUReplicate", "computation"}, // XlaLaunch ops are generated by EncapsulateXlaComputationsPass. @@ -355,12 +358,12 @@ absl::Status FunctionalizeControlFlowForXlaPass::Run( if (it == kNodeTypeToFunctionAttrMapping->end()) { continue; } - const string func_attr = it->second; + const std::string func_attr = it->second; NameAttrList func; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func)); VLOG(2) << "Graph has node " << n->type_string() << ". Corresponding function: " << func.name(); - string new_func_name = options.flib_def->UniqueFunctionName( + std::string new_func_name = options.flib_def->UniqueFunctionName( absl::StrCat(func.name(), "_f15n_")); bool modified; TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 7727853a8c4233..24fe7f5e13e7e0 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -46,7 +46,7 @@ namespace { // Returns the names of the "then" and "else" functions for the If node in a // graph. 
-absl::Status FindIfThenAndElse(const GraphDef& graph, string* op_name, +absl::Status FindIfThenAndElse(const GraphDef& graph, std::string* op_name, NameAttrList* then_fn, NameAttrList* else_fn) { for (const NodeDef& node : graph.node()) { if (node.op() == "If") { @@ -97,7 +97,7 @@ INSTANTIATE_TEST_SUITE_P( info) { bool restrict_to_tpu_nodes = std::get<0>(info.param); bool wrap_cond_in_function = std::get<1>(info.param); - string name = + std::string name = absl::StrCat(restrict_to_tpu_nodes ? "with_filter" : "without_filter", wrap_cond_in_function ? "_in_function" : "_in_graph"); return name; @@ -114,7 +114,7 @@ void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { auto identity_t = ops::Identity(scope.WithOpName("cond/Identity"), switch_1.output_true); - auto seventeen = ops::Const( + auto seventeen = ops::Const( scope.WithOpName("cond").WithControlDependencies(identity_t), 17); auto switch_2 = ops::Switch(scope.WithOpName("cond/Switch"), y, less); auto mul = ops::Multiply(scope.WithOpName("cond/Mul"), switch_2.output_true, @@ -122,7 +122,7 @@ void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { auto identity_f = ops::Identity(scope.WithOpName("cond/Identity"), switch_1.output_false); - auto twenty_three = ops::Const( + auto twenty_three = ops::Const( scope.WithOpName("cond").WithControlDependencies(identity_f), 23); auto switch_3 = ops::Switch(scope.WithOpName("cond/Switch"), x, less); auto add = ops::Add(scope.WithOpName("cond/false/add"), @@ -146,7 +146,7 @@ void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { void ConditionalTestFixture::CheckGraphDef( const GraphDef& graph_def, const FunctionLibraryDefinition& library) { - string op_name; + std::string op_name; NameAttrList then_fn; NameAttrList else_fn; TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); @@ -285,7 +285,7 @@ void ConditionalTestFixture::RunTest() { FunctionLibraryRuntime::Handle handle; // Functionalized function name is the type string of `cond_node`. 
-    string func_name;
+    std::string func_name;
     for (Node* n : graph.nodes()) {
       if (n->name() == "cond_node") {
         func_name = n->type_string();
@@ -341,7 +341,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
         ops::internal::Enter(scope.WithOpName("while/Enter2"), source, "aloop");
     auto merge = ops::Merge(scope.WithOpName("while/Merge"),
                             std::initializer_list<Input>{enter, dummy});
-    auto ten = ops::Const<int32>(
+    auto ten = ops::Const<int32_t>(
         scope.WithOpName("while/Less/y").WithControlDependencies(merge.output),
         10);
     auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten);
@@ -352,7 +352,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
                                     switch_.output_false);
     auto identity =
         ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
     auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
     auto next_iteration =
@@ -405,7 +405,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
     auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
+    auto ten = ops::Const<int32_t>(
         scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
     auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
     auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less, 0);
@@ -427,7 +427,7 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
     Scope scope = Scope::NewRootScope().ExitOnError();
     auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
     auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
     auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
     auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0);
@@ -463,7 +463,8 @@ FunctionDef GetNoinlineFunctionDef() {
 //   return [x + 1]
 // Define the above function, and add it to the given graph. It's used as the
 // while loop body in NoinlineLoopBody test.
-absl::Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
+absl::Status AddNoinlineFunctionToGraph(const std::string& node_name,
+                                        Graph* graph) {
   FunctionDefLibrary fdef_lib;
   *(fdef_lib.add_function()) = GetNoinlineFunctionDef();
   TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib));
@@ -481,7 +482,7 @@ absl::Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
 //   x = array_ops.placeholder(dtypes.int32)
 //   y = control_flow_ops.while_loop(lambda i: i < 10, increment_fn, [x])
 TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
-  const string& noinline_node_name = "while/increment_fn";
+  const std::string& noinline_node_name = "while/increment_fn";
   Graph graph(OpRegistry::Global());
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
@@ -491,7 +492,7 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
                                        "while/while_context");
     auto merge = ops::Merge(scope.WithOpName("while/Merge"),
                             std::initializer_list<Input>{enter, dummy});
-    auto ten = ops::Const<int32>(
+    auto ten = ops::Const<int32_t>(
         scope.WithOpName("while/Less/y").WithControlDependencies(merge.output),
         10);
     auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten);
@@ -585,7 +586,7 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
 }
 TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) {
-  const string& noinline_node_name = "while/increment_fn";
+  const std::string& noinline_node_name = "while/increment_fn";
   Graph graph(OpRegistry::Global());
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
@@ -622,7 +623,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
         ops::internal::Enter(scope.WithOpName("while/Enter"), source, "aloop");
     auto merge = ops::Merge(scope.WithOpName("while/Merge"),
                             std::initializer_list<Input>{enter, dummy});
-    auto ten = ops::Const<int32>(
+    auto ten = ops::Const<int32_t>(
         scope.WithOpName("while/Less/y").WithControlDependencies(merge.output),
         10);
     auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten);
@@ -631,7 +632,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
         ops::Switch(scope.WithOpName("while/Switch"), merge.output, loop_cond);
     auto identity =
         ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
     auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
     auto next_iteration =
@@ -673,7 +674,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
     auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
+    auto ten = ops::Const<int32_t>(
         scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
     auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
     auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less, 0);
@@ -695,7 +696,7 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
     Scope scope = Scope::NewRootScope().ExitOnError();
     auto arg = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
     auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
     auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
     auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0);
@@ -739,14 +740,15 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
                               std::initializer_list<Input>{enter_y, dummy});
     // Loop condition
-    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
-                                       .WithControlDependencies(merge_x.output),
-                                   3);
+    auto three =
+        ops::Const<int32_t>(scope.WithOpName("while/cond/three")
+                                .WithControlDependencies(merge_x.output),
+                            3);
     auto cond_add =
         ops::Add(scope.WithOpName("while/cond/Add"), merge_x.output, three);
-    auto ten = ops::Const<int32>(scope.WithOpName("while/cond/ten")
-                                     .WithControlDependencies(merge_x.output),
-                                 10);
+    auto ten = ops::Const<int32_t>(scope.WithOpName("while/cond/ten")
+                                       .WithControlDependencies(merge_x.output),
+                                   10);
     auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
     auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less);
@@ -765,10 +767,10 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
     auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"),
                                     switch_y.output_true);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
         1);
-    auto two = ops::Const<int32>(
+    auto two = ops::Const<int32_t>(
         scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
         2);
@@ -825,14 +827,15 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
     Scope scope = Scope::NewRootScope().ExitOnError();
     auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
     auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1);
-    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
-                                       .WithControlDependencies(arg0.output),
-                                   3);
+    auto three =
+        ops::Const<int32_t>(scope.WithOpName("while/cond/three")
+                                .WithControlDependencies(arg0.output),
+                            3);
     auto cond_add =
         ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
-    auto ten = ops::Const<int32>(scope.WithOpName("while/cond/ten")
-                                     .WithControlDependencies(arg0.output),
-                                 10);
+    auto ten = ops::Const<int32_t>(scope.WithOpName("while/cond/ten")
+                                       .WithControlDependencies(arg0.output),
+                                   10);
     auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
     auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less, 0);
@@ -859,10 +862,10 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
     auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
         1);
-    auto two = ops::Const<int32>(
+    auto two = ops::Const<int32_t>(
         scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
         2);
@@ -922,7 +925,7 @@ INSTANTIATE_TEST_SUITE_P(
      bool mark_inner_loop_tpu = std::get<1>(info.param);
      bool mark_outer_loop_tpu = std::get<2>(info.param);
-      string node_string;
+      std::string node_string;
      if (mark_inner_loop_tpu && mark_outer_loop_tpu)
        node_string = "both_loops_tpu";
      else if (!mark_inner_loop_tpu && !mark_outer_loop_tpu)
@@ -930,7 +933,7 @@ INSTANTIATE_TEST_SUITE_P(
      else
        node_string = mark_inner_loop_tpu ? "inner_loop_tpu" : "outer_loop_tpu";
-      string name = absl::StrCat(
+      std::string name = absl::StrCat(
          restrict_to_tpu_nodes ? "restricted_" : "unrestricted_", node_string);
      return name;
    });
@@ -961,21 +964,21 @@ void ComplexTestFixture::RunTest() {
     auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32);
     auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+    auto three = ops::Const<int32_t>(scope.WithOpName("three"), 3);
     auto y = ops::Add(scope.WithOpName("y"), x, three);
     auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
                                 TensorShape({}));
     // Outer loop
-    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+    auto zero = ops::Const<int32_t>(scope.WithOpName("outer/Const"), 0);
     auto enter_i =
         ops::internal::Enter(scope.WithOpName("outer/Enter_i"), zero, "outer");
     auto merge_i = ops::Merge(scope.WithOpName("outer/Merge_i"),
                               std::initializer_list<Input>{enter_i, dummy});
-    auto ten = ops::Const<int32>(scope.WithOpName("outer/Less/y")
-                                     .WithControlDependencies(merge_i.output),
-                                 10);
+    auto ten = ops::Const<int32_t>(scope.WithOpName("outer/Less/y")
+                                       .WithControlDependencies(merge_i.output),
+                                   10);
     auto less_i =
         ops::Less(scope.WithOpName("outer/Less_i"), merge_i.output, ten);
     auto outer_loop_cond =
@@ -998,7 +1001,7 @@ void ComplexTestFixture::RunTest() {
                              ops::internal::Enter::Attrs().IsConstant(true));
     // Inner loop
-    auto one_j = ops::Const<int32>(
+    auto one_j = ops::Const<int32_t>(
         scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
     auto enter_j = ops::internal::Enter(scope.WithOpName("outer/inner/Enter_j"),
                                         one_j, "inner");
@@ -1018,9 +1021,10 @@ void ComplexTestFixture::RunTest() {
     auto merge_k = ops::Merge(scope.WithOpName("outer/inner/Merge_k"),
                               std::initializer_list<Input>{enter_k, dummy});
-    auto five = ops::Const<int32>(scope.WithOpName("outer/inner/Five")
-                                      .WithControlDependencies(merge_j.output),
-                                  5);
+    auto five =
+        ops::Const<int32_t>(scope.WithOpName("outer/inner/Five")
+                                .WithControlDependencies(merge_j.output),
+                            5);
     auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"),
                             merge_j.output, five);
     auto loop_cond =
@@ -1047,7 +1051,7 @@ void ComplexTestFixture::RunTest() {
     auto assign = ops::AssignAddVariableOp(
         scope.WithOpName("outer/inner/assign_add"), enter_var, add_jkx);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("outer/inner/One")
             .WithControlDependencies(
                 absl::Span<const Operation>{assign.operation}),
@@ -1061,7 +1065,7 @@ void ComplexTestFixture::RunTest() {
         scope.WithOpName("outer/inner/NextIteration_k"), identity_k);
     // Body and backedge for outer loop.
-    auto one_outer = ops::Const<int32>(
+    auto one_outer = ops::Const<int32_t>(
         scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
     auto add_i =
         ops::Add(scope.WithOpName("outer/add")
@@ -1086,9 +1090,10 @@ void ComplexTestFixture::RunTest() {
   }
   // Add '_tpu_replicate' attributes as specified.
   for (Node* n : graph.nodes()) {
-    string name = n->name();
-    bool is_inner_node = name.find("outer/inner/") != string::npos;
-    bool is_outer_node = !is_inner_node && name.find("outer/") != string::npos;
+    std::string name = n->name();
+    bool is_inner_node = name.find("outer/inner/") != std::string::npos;
+    bool is_outer_node =
+        !is_inner_node && name.find("outer/") != std::string::npos;
     if ((is_inner_node && mark_inner_loop_tpu_) ||
         (is_outer_node && mark_outer_loop_tpu_)) {
       n->AddAttr("_tpu_replicate", "cluster");
@@ -1159,13 +1164,13 @@ void ComplexTestFixture::CheckOuterNodesFunctionalized(
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
     auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+    auto three = ops::Const<int32_t>(scope.WithOpName("three"), 3);
     auto y = ops::Add(scope.WithOpName("y"), x, three);
     auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
                                 TensorShape({}));
-    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+    auto zero = ops::Const<int32_t>(scope.WithOpName("outer/Const"), 0);
     auto while_op =
         ops::While(scope.WithOpName("outer/LoopCond"),
                    std::initializer_list<Input>{zero, y, x, var},
@@ -1184,7 +1189,7 @@ void ComplexTestFixture::CheckOuterNodesFunctionalized(
     auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2);
     auto arg3 = ops::_Arg(scope.WithOpName("arg3"), DT_RESOURCE, 3);
-    auto ten = ops::Const<int32>(
+    auto ten = ops::Const<int32_t>(
         scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
         10);
     auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
@@ -1220,14 +1225,14 @@ void ComplexTestFixture::CheckOuterNodesFunctionalized(
     auto arg3 = ops::_Arg(scope.WithOpName("arg3"), DT_RESOURCE, 3);
     auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
-    auto one_j = ops::Const<int32>(
+    auto one_j = ops::Const<int32_t>(
         scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
     auto while_op =
         ops::While(scope.WithOpName("outer/inner/LoopCond"),
                    std::initializer_list<Input>{one_j, arg1, arg2, arg3},
                    inner_cond_fn, inner_body_fn);
-    auto one_outer = ops::Const<int32>(
+    auto one_outer = ops::Const<int32_t>(
         scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
     auto add_i =
         ops::Add(scope.WithOpName("outer/add")
@@ -1262,7 +1267,7 @@ void ComplexTestFixture::CheckInnerNodesFunctionalized(
     auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2);
     auto arg3 = ops::_Arg(scope.WithOpName("arg3"), DT_RESOURCE, 3);
-    auto five = ops::Const<int32>(
+    auto five = ops::Const<int32_t>(
         scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5);
     auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
     auto retval = ops::_Retval(scope.WithOpName("retval0_RetVal"), less_j, 0);
@@ -1299,7 +1304,7 @@ void ComplexTestFixture::CheckInnerNodesFunctionalized(
     auto assign = ops::AssignAddVariableOp(
         scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
-    auto one = ops::Const<int32>(
+    auto one = ops::Const<int32_t>(
         scope.WithOpName("outer/inner/One")
             .WithControlDependencies(
                 absl::Span<const Operation>{assign.operation}),
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc
index cf3413154b8baa..d8558e7fb2b5fe 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc
@@ -42,7 +42,7 @@ absl::StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index) {
 absl::Status ExtractWhileLoopFrames(
     const std::vector<ControlFlowInfo>& cf_info, const Graph* graph,
-    std::unordered_map<string, WhileLoopFrame>* frames,
+    std::unordered_map<std::string, WhileLoopFrame>* frames,
     const NodeFilter& node_filter) {
   for (Node* node : graph->op_nodes()) {
     const ControlFlowInfo& cf = cf_info[node->id()];
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h
index 970f62daa42af3..90c50f75e36387 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h
@@ -47,7 +47,7 @@ struct WhileLoopArg {
 // Information about a loop frame.
 struct WhileLoopFrame {
-  string name;
+  std::string name;
   // Pointer to the parent frame. The root frame has a pointer to itself.
   WhileLoopFrame* parent = nullptr;
@@ -76,7 +76,7 @@ struct WhileLoopFrame {
 // `FunctionalizeControlFlow` for more details about node filters).
 absl::Status ExtractWhileLoopFrames(
     const std::vector<ControlFlowInfo>& cf_info, const Graph* graph,
-    std::unordered_map<string, WhileLoopFrame>* frames,
+    std::unordered_map<std::string, WhileLoopFrame>* frames,
     const NodeFilter& node_filter = {});
 // Check that the graph has no cycle containing the given node.
@@ -97,10 +97,10 @@ absl::StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index);
 // Returns a textual representation of the names of the nodes in the input.
 template <typename T>
-string NodesToString(const T& nodes) {
+std::string NodesToString(const T& nodes) {
   return absl::StrCat("{",
                       absl::StrJoin(nodes, ",",
-                                    [](string* output, const Node* node) {
+                                    [](std::string* output, const Node* node) {
                                       absl::StrAppend(output, node->name());
                                     }),
                       "}");
diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc
index 2c02379c36cd45..b8183afd59481a 100644
--- a/tensorflow/compiler/tf2xla/functionalize_while.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_while.cc
@@ -438,7 +438,7 @@ absl::Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame,
   builder.Attr("body", body_name);
   // Add some internal attributes which need to be propagated.
   for (absl::string_view attr_name : kAttrsToPropagate) {
-    string attr_val;
+    std::string attr_val;
     if (GetNodeAttr(frame->loop_cond->def(), attr_name, &attr_val).ok()) {
       builder.Attr(attr_name, attr_val);
     }
@@ -513,7 +513,7 @@ absl::Status FunctionalizeWhileLoop(Graph* graph,
   // connected to all source nodes in the graph. Many graphs violate this
   // invariant.
   std::vector<ControlFlowInfo> cf_info;
-  std::vector<string> unreachable_nodes;
+  std::vector<std::string> unreachable_nodes;
   TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info, &unreachable_nodes));
   if (!unreachable_nodes.empty()) {
     return errors::InvalidArgument(
@@ -522,7 +522,7 @@ absl::Status FunctionalizeWhileLoop(Graph* graph,
   }
   // Builds Frames, indexed by name.
-  std::unordered_map<string, WhileLoopFrame> frames;
+  std::unordered_map<std::string, WhileLoopFrame> frames;
   TF_RETURN_IF_ERROR(
       ExtractWhileLoopFrames(cf_info, graph, &frames, node_filter));
diff --git a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
index 2759ad8384cd81..b331272a2c9504 100644
--- a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
+++ b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
@@ -42,7 +42,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
-absl::Status GetTestDevice(Session* session, string* test_device) {
+absl::Status GetTestDevice(Session* session, std::string* test_device) {
   std::vector<DeviceAttributes> devices;
   TF_RETURN_IF_ERROR(session->ListDevices(&devices));
@@ -85,7 +85,7 @@ TEST(FusedBatchnormReserveSpaceTest, Test) {
   std::unique_ptr<Session> session(
       tensorflow::NewSession(tensorflow::SessionOptions{}));
-  string test_device;
+  std::string test_device;
   TF_ASSERT_OK(GetTestDevice(session.get(), &test_device));
   Scope root = tensorflow::Scope::NewRootScope();
@@ -108,8 +108,8 @@ TEST(FusedBatchnormReserveSpaceTest, Test) {
   Output variance =
       Const(root.WithOpName("variance"), Input::Initializer(variance_data));
-  string tf_device = absl::StrCat("/device:", test_device, ":0");
-  string xla_device = absl::StrCat("/device:XLA_", test_device, ":0");
+  std::string tf_device = absl::StrCat("/device:", test_device, ":0");
+  std::string xla_device = absl::StrCat("/device:XLA_", test_device, ":0");
   FusedBatchNorm fused_batch_norm_tf(
       root.WithOpName("fused_batch_norm_tf").WithDevice(tf_device), input,
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index f23c423fbb2632..5f794005b7c7c0 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -292,12 +292,12 @@ absl::Status GraphCompiler::CompileFunctionalNode(Node* n,
     }
   }
   if (add_token_input_output) {
-    std::vector<string> token_input_nodes;
+    std::vector<std::string> token_input_nodes;
     TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(&func.attr()),
                                    kXlaTokenInputNodesAttrName,
                                    &token_input_nodes));
     std::vector<xla::XlaOp> token_inputs;
-    for (const string& node_name : token_input_nodes) {
+    for (const std::string& node_name : token_input_nodes) {
       auto token_or = compiler->GetNodeToken(node_name);
       TF_RETURN_IF_ERROR(token_or.status());
       token_inputs.push_back(std::move(token_or).value());
diff --git a/tensorflow/compiler/tf2xla/graph_compiler_test.cc b/tensorflow/compiler/tf2xla/graph_compiler_test.cc
index 3010ac7f0b026b..2dcb2ea0b52d45 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler_test.cc
@@ -104,8 +104,8 @@ class GraphCompilerTest : public ::testing::Test {
     core::ScopedUnref context_unref(xla_context);
     xla_context->Ref();
-    auto step_container =
-        std::make_unique<ScopedStepContainer>(0, [this](const string& name) {
+    auto step_container = std::make_unique<ScopedStepContainer>(
+        0, [this](const std::string& name) {
          absl::Status status = this->device_->resource_manager()->Cleanup(name);
        });
diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc
index d1c984e26f390a..116c1e68f66fe6 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc
@@ -44,7 +44,7 @@ const char* const kFetchIdAttr = "_fetch_id";
 const char* const kShapeAttr = "_shape";
 const char* const kDebugNameAttr = "_debug_name";
-typedef std::unordered_map<string, Node*> NodeMap;
+typedef std::unordered_map<std::string, Node*> NodeMap;
 // Each feed id identifies the positional output of some node, which may consist
 // of multiple edges. AddPlaceholdersForFeeds has already replaced each fed
@@ -54,14 +54,14 @@ typedef std::unordered_map<string, Node*> NodeMap;
 absl::Status AddArgNodes(
     Graph* graph, const NodeMap& node_map,
     const protobuf::RepeatedPtrField<tf2xla::Feed>& feeds,
-    const std::unordered_map<string, string>& feed_remapping,
+    const std::unordered_map<std::string, std::string>& feed_remapping,
     std::unordered_set<const Node*>* arg_nodes) {
   for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) {
     const tf2xla::Feed& feed = feeds[arg_index];
     // All feeds have been replaced by placeholders.
     const int output_index = 0;
-    const string key = TensorIdToString(feed.id());
+    const std::string key = TensorIdToString(feed.id());
     const auto remap_it = feed_remapping.find(key);
     auto node_it = node_map.find(remap_it->second);
     if (node_it == node_map.end()) {
@@ -149,7 +149,7 @@ absl::Status AddRetvalNodes(
 // execution to know the input and output args for the generated function.
 absl::Status RewriteAndPruneGraph(
     Graph* graph, const tf2xla::Config& config,
-    const std::unordered_map<string, string>& feed_remapping) {
+    const std::unordered_map<std::string, std::string>& feed_remapping) {
   NodeMap node_map;
   for (Node* n : graph->nodes()) {
     node_map[n->name()] = n;
@@ -164,7 +164,7 @@ absl::Status RewriteAndPruneGraph(
   FixupSourceAndSinkEdges(graph);
   VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph);
   // Sanity-check, to make sure the feeds and fetches still exist post-pruning.
-  std::set<string> missing_feeds, missing_fetches;
+  std::set<std::string> missing_feeds, missing_fetches;
   for (const tf2xla::Feed& feed : config.feed()) {
     missing_feeds.insert(TensorIdToString(feed.id()));
   }
@@ -173,14 +173,14 @@ absl::Status RewriteAndPruneGraph(
   }
   for (const Node* n : graph->op_nodes()) {
     if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
-      string feed_id;
+      std::string feed_id;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFeedIdAttr, &feed_id));
       if (missing_feeds.erase(feed_id) == 0) {
         return errors::Aborted(FunctionLibraryDefinition::kArgOp,
                                " node found with unknown feed id: ", feed_id);
       }
     } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) {
-      string fetch_id;
+      std::string fetch_id;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFetchIdAttr, &fetch_id));
       if (missing_fetches.erase(fetch_id) == 0) {
         return errors::Aborted(FunctionLibraryDefinition::kRetOp,
@@ -277,7 +277,7 @@ absl::Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config,
   GraphDef first_copy_def = graph_def;
   // Maps from name:port of a feed to the name:port of the placeholder to use.
-  std::unordered_map<string, string> feed_remapping;
+  std::unordered_map<std::string, std::string> feed_remapping;
   TF_RETURN_IF_ERROR(AddPlaceholdersForFeeds(config, g->op_registry(),
                                              &feed_remapping, &first_copy_def));
diff --git a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc
index a6ddbfd3a01fef..74c888d37de784 100644
--- a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc
@@ -94,9 +94,9 @@ class CollectiveReduceV2Op : public XlaOpKernel {
  private:
   DataType dtype_ = DT_INVALID;
-  string merge_op_name_;
-  string final_op_name_;
-  string communication_hint_;
+  std::string merge_op_name_;
+  std::string final_op_name_;
+  std::string communication_hint_;
   CollectiveReduceV2Op(const CollectiveReduceV2Op&) = delete;
   void operator=(const CollectiveReduceV2Op&) = delete;
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 0dd528e3dea173..240a099f075aa2 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -48,7 +48,7 @@ class FusedBatchNormOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("exponential_avg_factor", &exponential_avg_factor_));
-    string data_format_str;
+    std::string data_format_str;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
     OP_REQUIRES(
         ctx, FormatFromString(data_format_str, &data_format_),
@@ -61,7 +61,7 @@ class FusedBatchNormOp : public XlaOpKernel {
                 errors::InvalidArgument(
                     "FusedBatchNormEx supports at most 1 side input."));
     add_side_input_ = (num_side_inputs == 1);
-    string activation_mode;
+    std::string activation_mode;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("activation_mode", &activation_mode));
     OP_REQUIRES(ctx,
                 activation_mode == "Identity" || activation_mode == "Relu",
@@ -249,7 +249,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
   explicit FusedBatchNormGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
-    string data_format_str;
+    std::string data_format_str;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
     OP_REQUIRES(
         ctx, FormatFromString(data_format_str, &data_format_),
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index 7c89720292b0a7..94486a104152ea 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -66,9 +66,11 @@ class BCastArgsOp : public XlaOpKernel {
     Tensor output(val_type, TensorShape({len}));
     for (int64_t i = 0; i < len; ++i) {
       if (val_type == DT_INT32) {
-        output.flat<int32>()(i) = static_cast<int32>(bcast.output_shape()[i]);
+        output.flat<int32_t>()(i) =
+            static_cast<int32_t>(bcast.output_shape()[i]);
       } else {
-        output.flat<int64>()(i) = static_cast<int64>(bcast.output_shape()[i]);
+        output.flat<int64_t>()(i) =
+            static_cast<int64_t>(bcast.output_shape()[i]);
       }
     }
     ctx->SetConstantOutput(0, output);
@@ -129,9 +131,9 @@ class BCastGradArgsOp : public XlaOpKernel {
     Tensor constant(val_type, TensorShape({len}));
     for (int64_t i = 0; i < len; ++i) {
       if (val_type == DT_INT32) {
-        constant.flat<int32>()(i) = static_cast<int32>(v[i]);
+        constant.flat<int32_t>()(i) = static_cast<int32_t>(v[i]);
       } else {
-        constant.flat<int64>()(i) = static_cast<int64>(v[i]);
+        constant.flat<int64_t>()(i) = static_cast<int64_t>(v[i]);
       }
     }
     ctx->SetConstantOutput(idx, constant);
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index 2bf4ab52c8b59e..bf428711664d76 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -28,7 +28,7 @@ namespace {
 class BiasOp : public XlaOpKernel {
  public:
   explicit BiasOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
+    std::string data_format;
     if (ctx->GetAttr("data_format", &data_format).ok()) {
       OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
                   errors::InvalidArgument("Invalid data format"));
diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
index 510d5225d6f04b..7d323b16d8856e 100644
--- a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
@@ -55,7 +55,7 @@ class BucketizeOp : public XlaOpKernel {
                      /*broadcast_dimensions=*/{0}),
         xla::S32);
     xla::XlaOp buckets = xla::Reduce(
-        comparison, /*init_value=*/xla::ConstantR0<int32>(builder, 0),
+        comparison, /*init_value=*/xla::ConstantR0<int32_t>(builder, 0),
        /*computation=*/xla::CreateScalarAddComputation(xla::S32, builder),
        /*dimensions_to_reduce=*/{0});
     context->SetOutput(0, buckets);
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc
index cead6d10c2a0eb..da40d84e73f063 100644
--- a/tensorflow/compiler/tf2xla/kernels/case_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc
@@ -66,7 +66,7 @@ XlaCaseOp::GetPrunedBranchesAndIndex(XlaOpKernelContext* ctx) {
     return {unpruned_branches_, ctx->Input(0)};
   }
-  int32_t branch_index = branch_index_literal.Get<int32>({});
+  int32_t branch_index = branch_index_literal.Get<int32_t>({});
   if (branch_index < 0 || branch_index >= unpruned_branches_.size()) {
     branch_index = unpruned_branches_.size() - 1;
   }
@@ -187,7 +187,8 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
     // Add any TensorArray gradients touched by the then/else computation to
     // the enclosing graph.
-    for (const string& grad_source : update.tensor_array_gradients_accessed) {
+    for (const std::string& grad_source :
+         update.tensor_array_gradients_accessed) {
       VLOG(5) << "TensorArray " << resource->name() << " accessed gradient "
               << grad_source;
       XlaResource* gradient;
@@ -289,7 +290,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
   // Set token input for this "case" op.
   std::vector<xla::XlaOp> token_inputs;
   token_inputs.reserve(token_input_nodes_.size());
-  for (const string& node_name : token_input_nodes_) {
+  for (const std::string& node_name : token_input_nodes_) {
     auto token_or = compiler->GetNodeToken(node_name);
     OP_REQUIRES_OK(ctx, token_or.status());
     token_inputs.push_back(token_or.value());
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h
index a4c01bea65a04d..6574fb4aac4c5e 100644
--- a/tensorflow/compiler/tf2xla/kernels/case_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.h
@@ -65,8 +65,8 @@ class XlaCaseOp : public XlaOpKernel {
   DataTypeVector input_types_;
   DataTypeVector output_types_;
   bool has_token_input_output_;
-  std::vector<string> token_input_nodes_;
-  string original_node_name_;
+  std::vector<std::string> token_input_nodes_;
+  std::string original_node_name_;
   // Whether to propagate compile time consts into the cond branches.
   // This is not supported by default now since it may cause HBM memory
   // overheads.
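For context, the kernel-constructor pattern these hunks keep touching looks like the minimal sketch below; the op name ExampleFormatOp is hypothetical, and under this migration only the spelling of the string type changes (the legacy tensorflow::string alias becomes std::string):

    #include <string>

    #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
    #include "tensorflow/core/util/tensor_format.h"

    namespace tensorflow {

    class ExampleFormatOp : public XlaOpKernel {
     public:
      explicit ExampleFormatOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
        // Attrs arrive as strings; read into std::string (was: string).
        std::string data_format;
        OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
        // Convert the textual attr into the enum used by the kernel.
        OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
                    errors::InvalidArgument("Invalid data format"));
      }

     private:
      TensorFormat data_format_ = FORMAT_NHWC;
    };

    }  // namespace tensorflow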
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index e8c804791299a7..2c69974d8373dc 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -185,7 +185,7 @@ class StatelessCategoricalOp : public CategoricalOp {
  private:
   DataType dtype_;
-  string device_type_string_;
+  std::string device_type_string_;
   StatelessCategoricalOp(const StatelessCategoricalOp&) = delete;
   void operator=(const StatelessCategoricalOp&) = delete;
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index d2463a9974b1bb..7ab53f7ad89e75 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -38,7 +38,7 @@ template ::value>::type* = nullptr>
 DstT CastTo(int32_t src) {
-  return absl::bit_cast<DstT>(static_cast<uint16>(src));
+  return absl::bit_cast<DstT>(static_cast<uint16_t>(src));
 }
 // Returns scalar constant with the value in the tensor, if the given proto has
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index 3fe22dcb4441e7..59f72e630c0f75 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -163,8 +163,8 @@ absl::Status CheckConvAttrs(const ConvOpAttrs& attrs) {
 absl::Status ConvBackpropComputeDimensionsV2XlaShapes(
     absl::string_view label, int num_spatial_dims,
     const xla::Shape& input_shape, const xla::Shape& filter_shape,
-    const xla::Shape& out_backprop_shape, absl::Span<const int32> dilations,
-    const std::vector<int32>& strides, Padding padding,
+    const xla::Shape& out_backprop_shape, absl::Span<const int32_t> dilations,
+    const std::vector<int32_t>& strides, Padding padding,
     TensorFormat data_format, ConvBackpropDimensions* dims,
     absl::Span<const int64_t> explicit_paddings) {
   TensorShape input_tensor_shape, filter_tensor_shape,
@@ -203,7 +203,7 @@ absl::StatusOr<ConvOpAttrs> ConvOpAttrs::Create(int num_spatial_dims,
                        ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings));
   }
-  string data_format;
+  std::string data_format;
   TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format));
   if (!FormatFromString(data_format, &attrs.data_format)) {
     return errors::InvalidArgument("Invalid data format: ", data_format);
@@ -231,7 +231,7 @@ absl::StatusOr<ConvNDOpAttrs> ConvNDOpAttrs::Create(OpKernelConstruction* ctx) {
                        ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings));
   }
-  string data_format_str;
+  std::string data_format_str;
   TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format_str));
   if (!(data_format_str == "CHANNELS_LAST" ||
         data_format_str == "CHANNELS_FIRST")) {
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
index 94e454df205df2..e64cebe3970cd8 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
@@ -54,8 +54,8 @@ struct ConvOpAttrs {
   bool depthwise;
   int num_spatial_dims;
-  std::vector<int32> dilations;
-  std::vector<int32> strides;
+  std::vector<int32_t> dilations;
+  std::vector<int32_t> strides;
   Padding padding;
   std::vector<int64_t> explicit_paddings;
   TensorFormat data_format;
@@ -68,8 +68,8 @@ struct ConvNDOpAttrs {
   int groups;
   int batch_dims;
-  std::vector<int32> dilations;
-  std::vector<int32> strides;
+  std::vector<int32_t> dilations;
+  std::vector<int32_t> strides;
   Padding padding;
   std::vector<int64_t> explicit_paddings;
   TensorFormat data_format;
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index b1da0acd61608f..82fdf8ea577e39 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -92,9 +92,9 @@ class ConvNDOp : public XlaOpKernel {
     ConvOpAttrs forward_attrs;
     forward_attrs.depthwise = false;
     forward_attrs.num_spatial_dims = num_spatial_dims;
-    forward_attrs.dilations = attrs_.dilations.empty()
-                                  ? std::vector<int32>(num_spatial_dims + 2, 1)
-                                  : attrs_.dilations;
+    forward_attrs.dilations =
+        attrs_.dilations.empty() ? std::vector<int32_t>(num_spatial_dims + 2, 1)
+                                 : attrs_.dilations;
     forward_attrs.strides = attrs_.strides;
     forward_attrs.padding = attrs_.padding;
     forward_attrs.explicit_paddings = attrs_.explicit_paddings;
diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc
index 226d6248bd00d8..27818415169dbe 100644
--- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc
@@ -36,9 +36,9 @@ class DataFormatDimMapOp : public XlaOpKernel {
  public:
   explicit DataFormatDimMapOp(OpKernelConstruction* context)
       : XlaOpKernel(context) {
-    string src_format;
+    std::string src_format;
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
-    string dst_format;
+    std::string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
     OP_REQUIRES(context, src_format.size() == 4 || src_format.size() == 5,
                 errors::InvalidArgument(
@@ -69,9 +69,9 @@ class DataFormatDimMapOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* context) override {
     auto builder = context->builder();
     xla::XlaOp dst_indices =
-        xla::ConstantR1(builder, absl::Span<const int32>(dst_idx_));
+        xla::ConstantR1(builder, absl::Span<const int32_t>(dst_idx_));
     const int dims = dst_idx_.size();
-    xla::XlaOp rank = xla::ConstantR0<int32>(builder, dims);
+    xla::XlaOp rank = xla::ConstantR0<int32_t>(builder, dims);
     xla::XlaOp src_indices =
         (xla::ConvertElementType(context->Input(0), xla::S32) + rank) % rank;
     xla::XlaOp output =
@@ -81,7 +81,7 @@ class DataFormatDimMapOp : public XlaOpKernel {
   }
  private:
-  std::vector<int32> dst_idx_;
+  std::vector<int32_t> dst_idx_;
   DataFormatDimMapOp(const DataFormatDimMapOp&) = delete;
   void operator=(const DataFormatDimMapOp&) = delete;
@@ -146,13 +146,13 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
                       input_tensor_shape.DebugString()));
     }
-    string src_format_str = src_format_;
-    string dst_format_str = dst_format_;
+    std::string src_format_str = src_format_;
+    std::string dst_format_str = dst_format_;
     if (input_tensor_shape.dim_size(0) == spatial_dim_count) {
       // If the input is a vector of size spatial_dim_count, treat the elements
       // as spatial dimensions.
       auto keep_only_spatial_dimensions =
-          [spatial_dim_count](string* format_str) -> void {
+          [spatial_dim_count](std::string* format_str) -> void {
         auto new_end =
             std::remove_if(format_str->begin(), format_str->end(),
                            [spatial_dim_count](const char dim) {
@@ -164,7 +164,7 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
       keep_only_spatial_dimensions(&src_format_str);
       keep_only_spatial_dimensions(&dst_format_str);
     }
-    std::vector<int32> dst_indices(dim0);
+    std::vector<int32_t> dst_indices(dim0);
     for (int i = 0; i < dim0; ++i) {
       for (int j = 0; j < dim0; ++j) {
         if (src_format_str[i] == dst_format_str[j]) {
@@ -174,14 +174,14 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
       }
     }
     xla::XlaOp indices =
-        xla::ConstantR1(builder, absl::Span<const int32>(dst_indices));
+        xla::ConstantR1(builder, absl::Span<const int32_t>(dst_indices));
     xla::XlaOp output = xla::TorchIndexSelect(ctx->Input(0), indices, 0);
     ctx->SetOutput(0, output);
   }
  private:
-  string src_format_;
-  string dst_format_;
+  std::string src_format_;
+  std::string dst_format_;
   DataFormatVecPermuteOp(const DataFormatVecPermuteOp&) = delete;
   void operator=(const DataFormatVecPermuteOp&) = delete;
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index e8e2babffd529c..7e93ed9c32e126 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -31,7 +31,7 @@ namespace {
 class DepthToSpaceOp : public XlaOpKernel {
  public:
   explicit DepthToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format_str;
+    std::string data_format_str;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
     OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
diff --git a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc
index d383c7d0ab4aa3..bc03e14556f9cb 100644
--- a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc
@@ -42,7 +42,7 @@ float get_fullrange() {
 class DequantizeOp : public XlaOpKernel {
  public:
   explicit DequantizeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string mode_string;
+    std::string mode_string;
     int axis;
     bool narrow_range;
diff --git a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc
index 141415bcd0d8c0..a5665baa6e3dc5 100644
--- a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc
@@ -39,11 +39,11 @@ class DeviceIndexOp : public XlaOpKernel {
     // When compiling we are not executing on any physical device, so we return
     // a sentinel value (size of the list of devices).
     ctx->SetOutput(
-        0, xla::ConstantR0<int32>(ctx->builder(), device_names_.size()));
+        0, xla::ConstantR0<int32_t>(ctx->builder(), device_names_.size()));
   }
  private:
-  std::vector<string> device_names_;
+  std::vector<std::string> device_names_;
 };
 REGISTER_XLA_OP(Name("DeviceIndex"), DeviceIndexOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc
index ceeea010ee7858..ae7488ad1e1cbd 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc
@@ -54,8 +54,8 @@ class DynamicPartitionOp : public XlaOpKernel {
   xla::XlaOp CountS32(XlaOpKernelContext* ctx, xla::XlaOp input,
                       int64_t target) {
     xla::XlaOp equal_dim =
-        xla::Compare(input, xla::ConstantR0<int32>(ctx->builder(), target), {},
-                     xla::ComparisonDirection::kEq);
+        xla::Compare(input, xla::ConstantR0<int32_t>(ctx->builder(), target),
+                     {}, xla::ComparisonDirection::kEq);
     xla::XlaOp casted = xla::ConvertElementType(equal_dim, xla::S32);
     return xla::ReduceAll(
         casted, xla::Zero(ctx->builder(), xla::S32),
@@ -178,8 +178,9 @@ class DynamicPartitionOp : public XlaOpKernel {
       } else {
         xla::XlaOp length;
         if (count_diff != 0) {
-          length = xla::Div(partition_length[i],
-                            xla::ConstantR0<int32>(ctx->builder(), count_diff));
+          length =
+              xla::Div(partition_length[i],
+                       xla::ConstantR0<int32_t>(ctx->builder(), count_diff));
         } else {
           length = CountS32(ctx, ctx->Input(1), /*target=*/i);
         }
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index cb7e4f6f96437e..edf9afb5ae14fb 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -145,8 +145,8 @@ class DynamicStitchOp : public XlaOpKernel {
     // Construct the reverse mapping, for each index, of which slice of which
     // input it comes from.
-    std::vector<int32> src_input_vector(number_of_indices);
-    std::vector<int32> src_slice_vector(number_of_indices);
+    std::vector<int32_t> src_input_vector(number_of_indices);
+    std::vector<int32_t> src_slice_vector(number_of_indices);
     std::vector src_index_used(number_of_indices);
     int index_used_count = 0;
     for (int input_num = 0; input_num < indices.size(); input_num++) {
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index 4a1de78d9371b3..b9ca65cfbd6371 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -179,9 +179,9 @@ class ExtractImagePatchesOp : public XlaOpKernel {
   }
  protected:
-  std::vector<int32> ksizes_;
-  std::vector<int32> dilations_;
-  std::vector<int32> strides_;
+  std::vector<int32_t> ksizes_;
+  std::vector<int32_t> dilations_;
+  std::vector<int32_t> strides_;
   Padding padding_;
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc
index b2b1eb3343e698..8075982c766a97 100644
--- a/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc
@@ -154,7 +154,7 @@ class FusedConv2DInt8Op : public XlaOpKernel {
     // Un-vectorize NCHW_VECT_C to NCHW.
     TensorFormat orig_data_format = conv_attrs_.data_format;
-    int64 vect_width = -1;
+    int64_t vect_width = -1;
     switch (conv_attrs_.data_format) {
       case FORMAT_NCHW_VECT_C:
         vect_width = conv_input_shape.dimensions(4);
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 2783951e1b6b0f..e94f74d1fed8ef 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -275,7 +275,7 @@ class GatherOp : public XlaOpKernel {
   // The number of batch dimensions, as passed in the batch_dims attribute.
   // It must be less than or equal to rank(indices).
-  int32 batch_dims_ = 0;
+  int32_t batch_dims_ = 0;
 };
 REGISTER_XLA_OP(Name("Gather"), MlirXlaOpKernel);
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc
index 033144e9f308e4..2aec21a6db5888 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc
@@ -28,7 +28,7 @@ namespace {
 class GatherOp : public XlaOpKernel {
  public:
   explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) {
-    string dnums_attr;
+    std::string dnums_attr;
     OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr));
     OP_REQUIRES(
         context, dnums_.ParsePartialFromString(dnums_attr),
@@ -60,7 +60,7 @@ class ScatterOp : public XlaOpKernel {
   explicit ScatterOp(OpKernelConstruction* context) : XlaOpKernel(context) {
     OP_REQUIRES_OK(
         context, context->GetAttr("update_computation", &update_computation_));
-    string dnums_attr;
+    std::string dnums_attr;
     OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr));
     OP_REQUIRES(
         context, dnums_.ParsePartialFromString(dnums_attr),
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 17db09722ba954..56c86d3d597227 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -84,7 +84,8 @@ static absl::StatusOr<bool> PopulateTensorArrayGradients(
     // Add any TensorArray gradients touched by the then/else computation to
     // the enclosing graph.
-    for (const string& grad_source : update.tensor_array_gradients_accessed) {
+    for (const std::string& grad_source :
+         update.tensor_array_gradients_accessed) {
       VLOG(5) << "TensorArray " << resource->name() << " accessed gradient "
               << grad_source;
       XlaResource* gradient;
@@ -318,7 +319,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
     if (has_token_input_output_ && i == num_inputs - 1) {
       // Set token input for this "if" op.
       std::vector<xla::XlaOp> token_inputs;
-      for (const string& node_name : token_input_nodes_) {
+      for (const std::string& node_name : token_input_nodes_) {
         auto token_or = compiler->GetNodeToken(node_name);
         OP_REQUIRES_OK(ctx, token_or.status());
         token_inputs.push_back(token_or.value());
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.h b/tensorflow/compiler/tf2xla/kernels/if_op.h
index fc6dd2e08bf41f..c11cfcb08e0b09 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.h
@@ -61,8 +61,8 @@ class XlaIfOp : public XlaOpKernel {
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   bool has_token_input_output_;
-  std::vector<string> token_input_nodes_;
-  string original_node_name_;
+  std::vector<std::string> token_input_nodes_;
+  std::string original_node_name_;
 };
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index a8eb7bbf794268..a2676e095b91b7 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -352,10 +352,11 @@ struct WhileCondFn {
                                     xla::XlaBuilder* cond_builder) const {
     xla::XlaOp row_idx = values[0];
     xla::XlaOp row_in_bounds =
-        xla::Lt(row_idx, xla::ConstantR0<int32>(cond_builder, num_boxes));
+        xla::Lt(row_idx, xla::ConstantR0<int32_t>(cond_builder, num_boxes));
     xla::XlaOp num_outputs_so_far = values[1];
-    xla::XlaOp results_not_full = xla::Lt(
-        num_outputs_so_far, xla::ConstantR0<int32>(cond_builder, output_size));
+    xla::XlaOp results_not_full =
+        xla::Lt(num_outputs_so_far,
+                xla::ConstantR0<int32_t>(cond_builder, output_size));
     return xla::And(row_in_bounds, results_not_full);
   }
 };
@@ -375,7 +376,7 @@ struct SuppressBodyFn {
     auto num_outputs_so_far = values[1];
     auto iou_mask = values[2];
     auto included_iou = values[3];
-    auto zero = xla::ConstantR0<int32>(builder, 0);
+    auto zero = xla::ConstantR0<int32_t>(builder, 0);
     // Determine if current elem is active using a slice.
     // TODO(b/118437727): The only reason we need an explicit vector is because
     // some old GCCs can't deduce the right type for MakeConstSpan, and
@@ -386,7 +387,7 @@ struct SuppressBodyFn {
     active_elem = xla::Reshape(active_elem, {});
     // Increment output count iff current elem is not suppressed.
     num_outputs_so_far = xla::Select(
-        active_elem, num_outputs_so_far + xla::ConstantR0<int32>(builder, 1),
+        active_elem, num_outputs_so_far + xla::ConstantR0<int32_t>(builder, 1),
         num_outputs_so_far);
     // Slice out the row_idx.
auto row_iou = xla::DynamicSlice(iou_mask, {row_idx, zero}, {1, num_boxes}); @@ -412,7 +413,7 @@ struct SuppressBodyFn { } included_iou = xla::Select(cond, xla::And(included_iou, supp_mask), included_iou); - row_idx = row_idx + xla::ConstantR0(builder, 1); + row_idx = row_idx + xla::ConstantR0(builder, 1); return std::vector{row_idx, num_outputs_so_far, iou_mask, included_iou}; } @@ -456,7 +457,7 @@ class NonMaxSuppressionOp : public XlaOpKernel { errors::InvalidArgument( "scores size ", std::to_string(scores_shape.dim_size(0)), " must equal number of boxes ", std::to_string(num_boxes))); - OP_REQUIRES(context, num_boxes <= kint32max, + OP_REQUIRES(context, num_boxes <= std::numeric_limits::max(), errors::InvalidArgument("XLA compilation requires number of " "boxes to be <= kint32max, got ", num_boxes)); @@ -477,7 +478,7 @@ class NonMaxSuppressionOp : public XlaOpKernel { OP_REQUIRES( context, output_size >= 0, errors::InvalidArgument("Need output_size >= 0, got ", output_size)); - OP_REQUIRES(context, output_size <= kint32max, + OP_REQUIRES(context, output_size <= std::numeric_limits::max(), errors::InvalidArgument("Need output_size <= kint32Max, got ", output_size)); const xla::XlaOp score_thresh = context->Input("score_threshold"); @@ -564,8 +565,8 @@ class NonMaxSuppressionOp : public XlaOpKernel { std::vector init_values; init_values.reserve(4); - init_values.push_back(xla::ConstantR0(builder, 0)); // col_idx - init_values.push_back(xla::ConstantR0(builder, 0)); // num_outputs + init_values.push_back(xla::ConstantR0(builder, 0)); // col_idx + init_values.push_back(xla::ConstantR0(builder, 0)); // num_outputs init_values.push_back(iou_thresh_mask); init_values.push_back(included_iou); @@ -595,8 +596,8 @@ class NonMaxSuppressionOp : public XlaOpKernel { // can be suppressed by score threshold. xla::XlaOp ones_included = xla::Select( included, - xla::Broadcast(xla::ConstantR0(builder, 1), {num_boxes}), - xla::Broadcast(xla::ConstantR0(builder, 0), {num_boxes})); + xla::Broadcast(xla::ConstantR0(builder, 1), {num_boxes}), + xla::Broadcast(xla::ConstantR0(builder, 0), {num_boxes})); // num_valid is scalar. Value should be bound by output_size. xla::XlaOp num_valid_total = xla::Reduce( @@ -604,8 +605,8 @@ class NonMaxSuppressionOp : public XlaOpKernel { /*init_value=*/xla::ConstantR0(builder, 0), /*computation=*/CreateScalarAddComputation(xla::S32, builder), /*dimensions_to_reduce=*/{0}); - xla::XlaOp num_valid = - xla::Min(num_valid_total, xla::ConstantR0(builder, output_size)); + xla::XlaOp num_valid = xla::Min( + num_valid_total, xla::ConstantR0(builder, output_size)); // Re-index into the original scores input tensor, using a Gather. // Boxes were suppressed in the sorted domain. diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 58811c10744131..9959f8d4e44be6 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -120,8 +120,8 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters( const int64_t out_size_factor = align_corners ? 
out_size[i] - 1 : out_size[i]; - int64_t gcd = MathUtil::GCD(static_cast(in_size_factor), - static_cast(out_size_factor)); + int64_t gcd = MathUtil::GCD(static_cast(in_size_factor), + static_cast(out_size_factor)); dims.stride[i] = in_size_factor / gcd; dims.kernel_size[i] = out_size_factor / gcd; } diff --git a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc index f357262a39c35b..5b730cc0a9076d 100644 --- a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc @@ -96,7 +96,7 @@ class InTopKOp : public XlaOpKernel { xla::CreateScalarAddComputation(xla::S32, xla_builder), {1}); xla::XlaOp result = - xla::And(xla::Lt(num_gt_r1, xla::ConstantR0(xla_builder, k)), + xla::And(xla::Lt(num_gt_r1, xla::ConstantR0(xla_builder, k)), xla::IsFinite(targets_values_r1)); context->SetOutput(0, result); diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc index 718f59e1227dc1..899c0063035b82 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc @@ -469,7 +469,7 @@ class TfCallbackDevice : public DeviceBase { set_tensorflow_accelerator_device_info(&accelerator_device_info_); } - const string& name() const override { return name_; } + const std::string& name() const override { return name_; } PerOpGpuDevice* MakeGpuDevice() override { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc index dfe8a36005b837..aabbd8d3b0514e 100644 --- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc @@ -60,7 +60,7 @@ class ListDiffOp : public XlaOpKernel { absl::Status status; switch (val_type) { case DT_INT32: - status = ListDiffWithIndexType(context, idx_type); + status = ListDiffWithIndexType(context, idx_type); break; case DT_INT64: status = ListDiffWithIndexType(context, idx_type); @@ -111,7 +111,7 @@ class ListDiffOp : public XlaOpKernel { DataType idx_type) { switch (idx_type) { case DT_INT32: - return ListDiff(context); + return ListDiff(context); case DT_INT64: return ListDiff(context); default: diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc index 48e8f976cc67bb..8e7c966bdf35fc 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc @@ -57,7 +57,7 @@ static inline bool IsLeftAligned(int diag_index, bool left_align_superdiagonal, void ReadAlignment(OpKernelConstruction* context, bool* left_align_superdiagonal, bool* left_align_subdiagonal) { - string align; + std::string align; OP_REQUIRES_OK(context, context->GetAttr("align", &align)); *left_align_superdiagonal = align == "LEFT_LEFT" || align == "LEFT_RIGHT"; diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc index 82dbfb3839312c..215de2bc5067e4 100644 --- a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc @@ -78,7 +78,7 @@ class OneHotOp : public XlaOpKernel { } private: - int32 axis_; + int32_t axis_; OneHotOp(const OneHotOp&) = delete; void operator=(const OneHotOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc 
b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 1758451faf469f..15b2b5f9d2ebbb 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -113,7 +113,7 @@ class PadOp : public XlaOpKernel { high_pad_size = xla::Reshape(high_pad_size, {}); high_pad_size = xla::ConvertElementType(high_pad_size, xla::S32); // Low pad has to be static. - xla::XlaOp low_pad_size = xla::ConstantR0( + xla::XlaOp low_pad_size = xla::ConstantR0( ctx->builder(), pad_literal.Get({i, 0})); xla::XlaOp input_size = xla::GetDimensionSize(input, i); xla::XlaOp total_size = low_pad_size + input_size + high_pad_size; @@ -122,7 +122,7 @@ class PadOp : public XlaOpKernel { total_size, xla::ValueInferenceMode::kUpperBound); OP_REQUIRES_OK(ctx, size_upper_bound_status_or.status()); auto size_upper_bound = - size_upper_bound_status_or.value().Get({}); + size_upper_bound_status_or.value().Get({}); OP_REQUIRES( ctx, size_upper_bound.has_value(), errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index aa7c78b8b8f97a..77db609d997614 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -88,8 +88,8 @@ class PoolingOp : public XlaOpKernel { num_spatial_dims_(num_spatial_dims), reduction_type_(reduction_type) { if (ctx->num_inputs() == 1) { - std::vector ksize_int; - std::vector stride_int; + std::vector ksize_int; + std::vector stride_int; OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); OP_REQUIRES(ctx, ksize_int.size() == num_dims(), errors::InvalidArgument("Sliding window ksize field must " @@ -255,15 +255,15 @@ class MaxPoolOp : public PoolingOp { ctx->builder()->GetShape(pooling); OP_REQUIRES_OK(ctx, result_shape.status()); - int64 num_channels = result_shape->dimensions(1); + int64_t num_channels = result_shape->dimensions(1); OP_REQUIRES( ctx, num_channels % *vect_width == 0, errors::FailedPrecondition("Result of NCHW_VECT_C op must have " "channels multiple of ", *vect_width, ", but was ", num_channels)); - absl::InlinedVector new_dims(result_shape->dimensions().begin(), - result_shape->dimensions().end()); + absl::InlinedVector new_dims( + result_shape->dimensions().begin(), result_shape->dimensions().end()); new_dims[1] /= *vect_width; new_dims.insert(new_dims.begin() + 2, *vect_width); pooling = @@ -298,7 +298,7 @@ class AvgPoolOp : public PoolingOp { : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ XlaHelpers::SumAccumulationType(ctx->input_type(0))) { - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -466,7 +466,7 @@ class MaxPool2DGradOp : public MaxPoolGradOp { public: explicit MaxPool2DGradOp(OpKernelConstruction* ctx) : MaxPoolGradOp(ctx, /*num_spatial_dims=*/2) { - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -505,7 +505,7 @@ class AvgPoolGradOp : public XlaOpKernel { errors::Unimplemented( "Pooling is not yet supported on the batch dimension.")); - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, 
&data_format_), errors::InvalidArgument("Invalid data format")); @@ -561,7 +561,7 @@ class AvgPoolGradOp : public XlaOpKernel { protected: const int num_spatial_dims_; std::vector<int64_t> ksize_; - std::vector<int32> stride_; + std::vector<int32_t> stride_; Padding padding_; TensorFormat data_format_ = FORMAT_NHWC; }; @@ -677,7 +677,7 @@ class MaxPoolGradGradOp : public XlaOpKernel { auto b = ctx->builder(); - auto sixteen = xla::ConstantR0<int32>(b, 16); + auto sixteen = xla::ConstantR0<int32_t>(b, 16); // in (f32) -> round to 7 mantissa bits (bf16)-> 16-high-bit u32. // // NOTE: Use a ReducePrecision operation instead of a cast to BF16 and back @@ -702,7 +702,7 @@ class MaxPoolGradGradOp : public XlaOpKernel { const xla::Shape scalar = xla::ShapeUtil::MakeShape(xla::F32, {}); auto lhs = xla::Parameter(rb.get(), 0, scalar, "lhs"); auto rhs = xla::Parameter(rb.get(), 1, scalar, "rhs"); - auto sixteen = xla::ConstantR0<int32>(rb.get(), 16); + auto sixteen = xla::ConstantR0<int32_t>(rb.get(), 16); auto lhs_criteria = xla::ShiftLeft(xla::ShiftRightLogical( xla::BitcastConvertType(lhs, xla::S32), sixteen), @@ -749,7 +749,7 @@ class MaxPool2DGradGradOp : public MaxPoolGradGradOp { public: explicit MaxPool2DGradGradOp(OpKernelConstruction* ctx) : MaxPoolGradGradOp(ctx, /*num_spatial_dims=*/2) { - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); @@ -767,7 +767,7 @@ class MaxPool3DGradGradOp : public MaxPoolGradGradOp { public: explicit MaxPool3DGradGradOp(OpKernelConstruction* ctx) : MaxPoolGradGradOp(ctx, /*num_spatial_dims=*/3) { - string data_format; + std::string data_format; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index cac9f8a68f234e..961fce9caa7728 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -113,7 +113,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { errors::Internal("Expected 4 inputs to QuantizeAndDequantize")); num_bits = ctx->Input(3); } else { - num_bits = xla::ConstantR0<int32>(b, num_bits_); + num_bits = xla::ConstantR0<int32_t>(b, num_bits_); } const xla::XlaOp zero = XlaHelpers::Zero(b, data_type); @@ -129,17 +129,17 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { xla::XlaOp min_quantized, max_quantized; if (signed_input_) { if (narrow_range_) { - min_quantized = - -Pow(two, ConvertElementType( - num_bits - xla::ConstantR0<int32>(b, 1), xla_type)) + - one; + min_quantized = -Pow(two, ConvertElementType( + num_bits - xla::ConstantR0<int32_t>(b, 1), + xla_type)) + + one; } else { min_quantized = -Pow(two, ConvertElementType( - num_bits - xla::ConstantR0<int32>(b, 1), xla_type)); + num_bits - xla::ConstantR0<int32_t>(b, 1), xla_type)); } max_quantized = - Pow(two, ConvertElementType(num_bits - xla::ConstantR0<int32>(b, 1), + Pow(two, ConvertElementType(num_bits - xla::ConstantR0<int32_t>(b, 1), xla_type)) - one; } else { @@ -222,7 +222,7 @@ class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp { OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 
62 : 63), errors::InvalidArgument("num_bits is out of range: ", num_bits_, " with signed_input_ ", signed_input_)); - string round_mode_string; + std::string round_mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string)); OP_REQUIRES( ctx, diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc b/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc index 8f2350f26861c4..dea3ecf85af7b8 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc @@ -140,7 +140,7 @@ absl::StatusOr<int64_t> GetAlgId(XlaOpKernelContext* ctx, int alg_input_idx) { if (alg_dtype == DT_INT32) { return alg_literal.Get<int32_t>({}); } else { - return alg_literal.Get<int64>({}); + return alg_literal.Get<int64_t>({}); } } @@ -172,7 +172,7 @@ DataType MaybeConvertBF16ToF32(DataType const& dtype) { } absl::StatusOr<xla::XlaOp> BuildUniformRandoms( - XlaOpKernelContext* ctx, DataType dtype, string device_type_string, + XlaOpKernelContext* ctx, DataType dtype, std::string device_type_string, TensorShape shape, std::function<xla::XlaOp(xla::XlaBuilder*)> lo_fn, std::function<xla::XlaOp(xla::XlaBuilder*)> hi_fn) { @@ -190,7 +190,7 @@ absl::StatusOr<xla::XlaOp> BuildUniformRandoms( absl::StatusOr<xla::XlaOp> BuildUniformRandoms(XlaOpKernelContext* ctx, DataType dtype, - string device_type_string, + std::string device_type_string, xla::Shape xla_shape, xla::XlaOp lo, xla::XlaOp hi) { xla::XlaOp key = ctx->Input(kRandomKeyInputIdx); diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h index 11ff44602f1900..5fb7aa4822834c 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h +++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h @@ -73,7 +73,7 @@ DataType MaybeConvertBF16ToF32(DataType const& dtype); // type, in the given low and high range, where low and high are expressed in // XLA functions. absl::StatusOr<xla::XlaOp> BuildUniformRandoms( - XlaOpKernelContext* ctx, DataType dtype, string device_type_string, + XlaOpKernelContext* ctx, DataType dtype, std::string device_type_string, TensorShape shape, std::function<xla::XlaOp(xla::XlaBuilder*)> lo, std::function<xla::XlaOp(xla::XlaBuilder*)> hi); @@ -82,7 +82,7 @@ absl::StatusOr<xla::XlaOp> BuildUniformRandoms( // ops. absl::StatusOr<xla::XlaOp> BuildUniformRandoms(XlaOpKernelContext* ctx, DataType dtype, - string device_type_string, + std::string device_type_string, xla::Shape xla_shape, xla::XlaOp lo, xla::XlaOp hi); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 6a8a98342c1123..3bfe9e384405b2 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -119,7 +119,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { } } - string desc = ctx->op_kernel().name(); + std::string desc = ctx->op_kernel().name(); xla::XlaBuilder* const b = ctx->builder(); // Construct the builder for the reduction lambda. 
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc index c54c4613d29e44..a1dd0164e73fc7 100644 --- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc @@ -311,7 +311,7 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::Pad(grad_data, xla::Zero(ctx->builder(), warp_type), xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}})); - auto shifting_value = xla::ConstantR1<int32>( + auto shifting_value = xla::ConstantR1<int32_t>( ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1}); auto shifted_gather_indices = xla::Add(gather_indices, shifting_value, {last_warp_dim}); @@ -384,7 +384,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio, xla::Pad(data, xla::Zero(ctx->builder(), data_type), xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}})); - auto shifting_value = xla::ConstantR1<int32>( + auto shifting_value = xla::ConstantR1<int32_t>( ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1}); auto shifted_gather_indices = xla::Add(gather_indices, shifting_value, {last_warp_dim}); diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc index 5cecbf37706283..5c77a4dfe29934 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -134,8 +134,8 @@ class ReverseSequenceOp : public XlaOpKernel { } private: - int32 batch_dim_; - int32 seq_dim_; + int32_t batch_dim_; + int32_t seq_dim_; }; REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp); diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc index e1e93d614286a3..32b75c26c70212 100644 --- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc @@ -35,7 +35,7 @@ class SendOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override; private: - string tensor_name_; + std::string tensor_name_; SendOp(const SendOp&) = delete; void operator=(const SendOp&) = delete; @@ -60,7 +60,7 @@ class RecvOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override; private: - string tensor_name_; + std::string tensor_name_; xla::Shape shape_; RecvOp(const RecvOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 108bf3848aae93..d24d1688d188a6 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -104,7 +104,8 @@ class RangeOp : public XlaOpKernel { absl::StatusOr<xla::XlaOp> output; switch (type) { case DT_INT32: - output = CreateRangeTensor<int32>(start, limit, delta, ctx->builder()); + output = + CreateRangeTensor<int32_t>(start, limit, delta, ctx->builder()); break; case DT_INT64: output = diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 7e8889cb2ccee6..07bf81e9d76b58 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -109,7 +109,7 @@ class XlaSetBoundOp : public XlaOpKernel { bound_shape.DebugString())); int64_t bound; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("bound", &bound)); - xla::Literal bound_literal = xla::LiteralUtil::CreateR0<int32>(bound); + xla::Literal bound_literal = 
xla::LiteralUtil::CreateR0<int32_t>(bound); xla::XlaOp result = xla::CustomCall( ctx->builder(), "SetBound", {ctx->Input("input")}, ctx->InputXlaShape("input").value(), "", false, {}, &bound_literal); diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc index 57825657b205ab..beb38ce9a273ea 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_util.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc @@ -33,15 +33,15 @@ absl::Status TensorShapeToConstant(const TensorShape& input_shape, Tensor* shape_constant) { const int dims = input_shape.dims(); if (shape_constant->dtype() == DT_INT32) { - auto vec = shape_constant->vec<int32>(); + auto vec = shape_constant->vec<int32_t>(); for (int i = 0; i < dims; ++i) { int64_t dim_size = input_shape.dim_size(i); - if (!FastBoundsCheck(dim_size, std::numeric_limits<int32>::max())) { + if (!FastBoundsCheck(dim_size, std::numeric_limits<int32_t>::max())) { return errors::InvalidArgument( "Shape with out_type=int32 does not support tensors > int32max", " but dim ", i, " is ", dim_size); } - vec(i) = static_cast<int32>(dim_size); + vec(i) = static_cast<int32_t>(dim_size); } } else { auto vec = shape_constant->vec<int64_t>(); diff --git a/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc b/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc index 74e04e035ef3be..0ee9173cda69e3 100644 --- a/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc @@ -101,8 +101,8 @@ absl::Status GetAndValidateAttributes(OpKernelConstruction* ctx, return absl::OkStatus(); } -std::vector<int64> GetSliceIndices(absl::Span<const int64> num_partitions, -                                   absl::Span<const int64> slice_shape, +std::vector<int64_t> GetSliceIndices(absl::Span<const int64_t> num_partitions, +                                     absl::Span<const int64_t> slice_shape, const int index) { DCHECK_EQ(num_partitions.size(), slice_shape.size()); @@ -213,7 +213,7 @@ class XlaSplitNDBaseOp : public XlaOpKernel { // Calculate paddings necessary for slice instead of padding input and // slicing subsequently to reduce temporary memory allocation. for (int dim = 0; dim < rank; ++dim) { - const int64 dim_size = input_shape.dim_size(dim); + const int64_t dim_size = input_shape.dim_size(dim); if (slice_start_indices[dim] >= dim_size) { // Complete padding. 
slice_start_indices[dim] = dim_size; @@ -387,9 +387,9 @@ class XlaConcatNDBaseOp : public XlaOpKernel { std::vector<xla::XlaOp> update_slice_start_indices; update_slice_start_indices.reserve(rank); - for (int64 start_index : slice_start_indices) { + for (int64_t start_index : slice_start_indices) { update_slice_start_indices.push_back( - xla::ConstantR0<int32>(ctx->builder(), start_index)); + xla::ConstantR0<int32_t>(ctx->builder(), start_index)); } output = xla::DynamicUpdateSlice(output, input_slice, update_slice_start_indices); diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 844a31f97990fc..b0e337cec20c33 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -180,8 +180,8 @@ class SliceOp : public XlaOpKernel { xla::Reshape(xla::Slice(ctx->Input(2), {i}, {i + 1}, {1}), {}); if (constant_size_is_minus_one && size[i] == -1) { // size = input_.dim_size(i) - begin[i] - dynamic_size = xla::ConstantR0<int32>(ctx->builder(), - input_shape.dim_size(i)) - + dynamic_size = xla::ConstantR0<int32_t>(ctx->builder(), + input_shape.dim_size(i)) - begin_indices[i]; } auto constant_size = ctx->value_inference().AnalyzeConstant( @@ -192,7 +192,7 @@ class SliceOp : public XlaOpKernel { // triggered when some dimensions's slice sizes are constant while // some are dynamic. sliced = xla::SliceInDim( - sliced, 0, constant_size->Get<int32>({}).value(), 1, i); + sliced, 0, constant_size->Get<int32_t>({}).value(), 1, i); } else { // We gave a generous bound (same as input) to the output, try reset // the bound if a tighter one can be found. diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc index ac33e0877200dc..180ba322f0fdd0 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc @@ -34,7 +34,7 @@ namespace { class SpaceToDepthOp : public XlaOpKernel { public: explicit SpaceToDepthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - string data_format_str; + std::string data_format_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); diff --git a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc index 124e36557f1429..f6d468131ac94e 100644 --- a/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/spmd_manual_sharding_ops.cc @@ -69,8 +69,8 @@ class XlaSpmdFullToShardShapeOp : public XlaOpKernel { } private: - string manual_sharding_str_; - int32 single_dim_; + std::string manual_sharding_str_; + int32_t single_dim_; std::vector<int64_t> unspecified_dims_; XlaSpmdFullToShardShapeOp(const XlaSpmdFullToShardShapeOp&) = delete; void operator=(const XlaSpmdFullToShardShapeOp&) = delete; @@ -120,8 +120,8 @@ class XlaSpmdShardToFullShapeOp : public XlaOpKernel { private: TensorShape full_shape_; - string manual_sharding_str_; - int32 single_dim_; + std::string manual_sharding_str_; + int32_t single_dim_; std::vector<int64_t> unspecified_dims_; XlaSpmdShardToFullShapeOp(const XlaSpmdShardToFullShapeOp&) = delete; void operator=(const XlaSpmdShardToFullShapeOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index 3c99ad63565266..4672477be3534b 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -120,7 +120,7 @@ class StackOp : public XlaOpKernel { private: DataType dtype_; - string stack_name_; + std::string stack_name_; StackOp(const StackOp&) = delete; void operator=(const StackOp&) = delete; @@ -152,7 +152,7 @@ class StackPushOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1, - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; TensorShape slice_shape = elem_shape; @@ -164,7 +164,7 @@ class StackPushOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple( b, {xla::DynamicUpdateSlice(ta, update, start_indices), - xla::Add(index, xla::ConstantR0<int32>(b, 1))}))); + xla::Add(index, xla::ConstantR0<int32_t>(b, 1))}))); ctx->SetOutput(0, value); } @@ -204,12 +204,12 @@ class StackPopOp : public XlaOpKernel { xla::XlaOp ta = xla::GetTupleElement(state, 0); xla::XlaOp index = xla::GetTupleElement(state, 1); - index = Sub(index, xla::ConstantR0<int32>(b, 1)); + index = Sub(index, xla::ConstantR0<int32_t>(b, 1)); OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple(b, {ta, index}))); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. std::vector<xla::XlaOp> start_indices(stack_shape.dims(), - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; auto slice_shape = stack_shape.dim_sizes(); diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc index e7ff8194b96ce8..80047c5f17cc98 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc @@ -511,7 +511,7 @@ class RngSkipOp : public XlaOpKernel { REGISTER_XLA_OP(Name("RngSkip").CompileTimeConstantInput("algorithm"), RngSkipOp<>); -using RngReadAndSkipOp = RngSkipOp<int32, true>; +using RngReadAndSkipOp = RngSkipOp<int32_t, true>; REGISTER_XLA_OP(Name("RngReadAndSkip").CompileTimeConstantInput("alg"), RngReadAndSkipOp); diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index aa71c5c34d2e1a..246981c3465ef1 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -76,7 +76,7 @@ xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype) { // `BitcastConvertType(ConvertElementType(u32, U16), BF16)`, to avoid the // unclear `ConvertElementType(f32, BF16)` behavior. 
xla::XlaOp output = xla::BitcastConvertType(input, xla::U32) & - xla::ConstantR0<uint32>(builder, 0xFFFF0000); + xla::ConstantR0<uint32_t>(builder, 0xFFFF0000); return xla::ConvertElementType(xla::BitcastConvertType(output, xla::F32), xla::BF16); } else { @@ -184,7 +184,7 @@ class StatelessRandomUniformOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformOp(const StatelessRandomUniformOp&) = delete; void operator=(const StatelessRandomUniformOp&) = delete; @@ -240,7 +240,7 @@ class StatelessRandomUniformIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformIntOp(const StatelessRandomUniformIntOp&) = delete; void operator=(const StatelessRandomUniformIntOp&) = delete; @@ -283,7 +283,7 @@ class StatelessRandomUniformFullIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformFullIntOp(const StatelessRandomUniformFullIntOp&) = delete; @@ -336,7 +336,7 @@ class StatelessRandomNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomNormalOp(const StatelessRandomNormalOp&) = delete; void operator=(const StatelessRandomNormalOp&) = delete; @@ -384,7 +384,7 @@ class StatelessTruncatedNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessTruncatedNormalOp(const StatelessTruncatedNormalOp&) = delete; void operator=(const StatelessTruncatedNormalOp&) = delete; @@ -449,7 +449,7 @@ class StatelessParameterizedTruncatedNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessParameterizedTruncatedNormalOp( const StatelessParameterizedTruncatedNormalOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc index ce1fee91ae6a51..689e6ca3f7bf41 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc @@ -128,7 +128,7 @@ class StatelessRandomUniformOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformOp(const StatelessRandomUniformOp&) = delete; void operator=(const StatelessRandomUniformOp&) = delete; @@ -177,7 +177,7 @@ class StatelessRandomUniformIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformIntOp(const StatelessRandomUniformIntOp&) = delete; void operator=(const StatelessRandomUniformIntOp&) = delete; @@ -225,7 +225,7 @@ class StatelessRandomUniformFullIntOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomUniformFullIntOp(const StatelessRandomUniformFullIntOp&) = delete; @@ -295,7 +295,7 @@ class StatelessRandomNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string device_type_string_; StatelessRandomNormalOp(const StatelessRandomNormalOp&) = delete; void operator=(const StatelessRandomNormalOp&) = delete; @@ -330,7 +330,7 @@ class StatelessTruncatedNormalOp : public XlaOpKernel { private: DataType dtype_; - string device_type_string_; + std::string 
device_type_string_; StatelessTruncatedNormalOp(const StatelessTruncatedNormalOp&) = delete; void operator=(const StatelessTruncatedNormalOp&) = delete; @@ -369,7 +369,7 @@ class GetKeyCounterOp : public XlaOpKernel { } private: - string device_type_string_; + std::string device_type_string_; GetKeyCounterOp(const GetKeyCounterOp&) = delete; void operator=(const GetKeyCounterOp&) = delete; @@ -392,7 +392,7 @@ class GetAlgOp : public XlaOpKernel { } private: - string device_type_string_; + std::string device_type_string_; GetAlgOp(const GetAlgOp&) = delete; void operator=(const GetAlgOp&) = delete; @@ -430,7 +430,7 @@ class GetKeyCounterAlgOp : public XlaOpKernel { } private: - string device_type_string_; + std::string device_type_string_; GetKeyCounterAlgOp(const GetKeyCounterAlgOp&) = delete; void operator=(const GetKeyCounterAlgOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index e15196bd756462..1b44d1e07c4bd8 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -142,7 +142,7 @@ class StridedSliceOp : public XlaOpKernel { // Pad input to 2x to avoid OOB access. slice = xla::Pad(slice, xla::Zero(ctx->builder(), ctx->input_xla_type(0)), padding_config); - for (int64 i = 0; i < result_dims_are_dynamic.size(); ++i) { + for (int64_t i = 0; i < result_dims_are_dynamic.size(); ++i) { if (result_dims_are_dynamic[i]) { slice = xla::RemoveDynamicDimension(slice, i); } @@ -178,7 +178,7 @@ class StridedSliceOp : public XlaOpKernel { // Can't infer a lower bound. return false; } - return lower_bound->Get<int32>({}) >= 0; + return lower_bound->Get<int32_t>({}) >= 0; }; if (begin_mask) { begin_index = zero; @@ -220,7 +220,7 @@ class StridedSliceOp : public XlaOpKernel { // size 1 dims of a shape. slice = xla::Reshape(slice, final_shape.dim_sizes()); for (int64_t i = 0; i < final_shape.dims(); ++i) { - int64 processing_shape_dim = shape_spec.output_to_processing_mapping[i]; + int64_t processing_shape_dim = shape_spec.output_to_processing_mapping[i]; // If processing_shape_dim is -1, it means the output dimension was newly // added by new_axis_mask_, which doesn't show up in input. if (processing_shape_dim != -1) { @@ -341,9 +341,9 @@ class StridedSliceOp : public XlaOpKernel { int64_t sparse_index = shape_spec.output_to_sparse_mapping[i]; bool end_is_dynamic = sparse_index == -1 ? false : ends_are_dynamic[sparse_index]; - bool backward_slice = sparse_index == -1 - ? false - : end_literal.Get<int32>({sparse_index}) < 0; + bool backward_slice = + sparse_index == -1 ? false + : end_literal.Get<int32_t>({sparse_index}) < 0; if (input_is_dynamic || end_is_dynamic) { OP_REQUIRES( ctx, strides[input_index] == 1, @@ -363,8 +363,8 @@ class StridedSliceOp : public XlaOpKernel { "sized slice with dynamic negative index %lld. ")); operand_size = xla::Add( operand_size, - xla::ConstantR0<int32>(ctx->builder(), - end_literal.Get<int32>({sparse_index}))); + xla::ConstantR0<int32_t>( + ctx->builder(), end_literal.Get<int32_t>({sparse_index}))); } else { // The end of slice with dynamic slice size is the min of operand // shape and slice size. 
E.g., t[:end_size], result size is @@ -376,13 +376,13 @@ class StridedSliceOp : public XlaOpKernel { {}); } else { end_size = - xla::ConstantR0<int32>(ctx->builder(), end[input_index]); + xla::ConstantR0<int32_t>(ctx->builder(), end[input_index]); } operand_size = xla::Min(operand_size, end_size); } slice = xla::SetDimensionSize( slice, - xla::Sub(operand_size, xla::ConstantR0<int32>( + xla::Sub(operand_size, xla::ConstantR0<int32_t>( ctx->builder(), begin[input_index])), i); } @@ -397,8 +397,8 @@ class StridedSliceOp : public XlaOpKernel { } private: - int32 begin_mask_, end_mask_; - int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + int32_t begin_mask_, end_mask_; + int32_t ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; }; @@ -634,8 +634,8 @@ class StridedSliceGradOp : public XlaOpKernel { } private: - int32 begin_mask_, end_mask_; - int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + int32_t begin_mask_, end_mask_; + int32_t ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; }; @@ -751,8 +751,8 @@ class StridedSliceAssignOp : public XlaOpKernel { } private: - int32 begin_mask_, end_mask_; - int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; + int32_t begin_mask_, end_mask_; + int32_t ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; DataType dtype_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 888908e30b2331..e89c3e3b4f837b 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -94,7 +94,7 @@ absl::Status MaybeInitializeTensorArray(xla::XlaBuilder* builder, // Checks that the TensorArray 'resource' has been initialized, and has type // 'dtype'. Sets 'shape' to the shape -absl::Status CheckTensorArrayIsInitialized(const string& op_name, +absl::Status CheckTensorArrayIsInitialized(const std::string& op_name, const XlaResource* resource, DataType dtype) { if (resource->kind() != XlaResource::kTensorArray) { @@ -184,7 +184,7 @@ class TensorArrayOp : public XlaOpKernel { private: PartialTensorShape element_shape_; DataType dtype_; - string tensor_array_name_; + std::string tensor_array_name_; TensorArrayOp(const TensorArrayOp&) = delete; void operator=(const TensorArrayOp&) = delete; @@ -218,7 +218,7 @@ class TensorArrayWriteOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1, - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; TensorShape slice_shape = elem_shape; @@ -270,7 +270,7 @@ class TensorArrayReadOp : public XlaOpKernel { // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. std::vector<xla::XlaOp> start_indices(ta_shape.dims(), - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; auto slice_shape = ta_shape.dim_sizes(); @@ -430,7 +430,7 @@ class TensorArrayScatterOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
auto index = xla::Reshape(xla::Slice(indices, {i}, {i + 1}, {1}), {}); std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1, - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices, dtype_); } @@ -570,7 +570,8 @@ class TensorArraySizeOp : public XlaOpKernel { XlaResource* var; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var)); Tensor size_tensor(DT_INT32, {}); - size_tensor.scalar<int32>()() = static_cast<int32>(var->max_array_size()); + size_tensor.scalar<int32_t>()() = + static_cast<int32_t>(var->max_array_size()); ctx->SetConstantOutput(0, size_tensor); } @@ -609,7 +610,7 @@ class TensorArrayGradOp : public XlaOpKernel { } private: - string source_; + std::string source_; TensorArrayGradOp(const TensorArrayGradOp&) = delete; void operator=(const TensorArrayGradOp&) = delete; diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index a1f58d5ae9b40e..f128c96c570e6c 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -70,7 +70,7 @@ absl::StatusOr<std::vector<std::vector<xla::XlaOp>>> GetTensorListDynamicDims( dynamic_dims.push_back(ctx->Input(1)); } else { dynamic_dims.push_back( - xla::ConstantR0<int32>(ctx->builder(), num_elements)); + xla::ConstantR0<int32_t>(ctx->builder(), num_elements)); } for (int64_t dim = 0; dim < element_shape.dimensions().size(); ++dim) { if (dims_are_dynamic[dim]) { @@ -80,7 +80,7 @@ absl::StatusOr<std::vector<std::vector<xla::XlaOp>>> GetTensorListDynamicDims( dynamic_dims.push_back(dynamic_dim_size); } else { dynamic_dims.push_back( - xla::ConstantR0<int32>(ctx->builder(), dynamic_sizes[dim])); + xla::ConstantR0<int32_t>(ctx->builder(), dynamic_sizes[dim])); } } list_dynamic_dims.push_back(std::move(dynamic_dims)); @@ -191,7 +191,7 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK( ctx, SetTensorListPushIndex( - new_list, xla::ConstantR0<int32>(ctx->builder(), num_elements), + new_list, xla::ConstantR0<int32_t>(ctx->builder(), num_elements), &result)); ctx->SetTensorListOutput(0, result); return; @@ -324,13 +324,13 @@ class TensorListElementShapeOp : public XlaOpKernel { ctx->SetOutput(0, xla::ConstantR1(b, list_shape.dimensions())); break; case DT_INT32: { - std::vector<int32> size; + std::vector<int32_t> size; const auto& dimensions = list_shape.dimensions(); size.reserve(dimensions.size()); for (int64_t s : dimensions) { size.push_back(s); } - ctx->SetOutput(0, xla::ConstantR1<int32>(b, size)); + ctx->SetOutput(0, xla::ConstantR1<int32_t>(b, size)); break; } default: diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index 683dc4737e6dab..0a7297456fce8d 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -393,7 +393,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, std::vector<xla::XlaOp> start_indices( element_part_shape.dimensions().size() + 1, - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = push_index; xla::XlaOp list_part = xla::GetTupleElement(list, i); @@ -409,7 +409,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, xla::XlaOp update = xla::Reshape(element, element_dims); std::vector<xla::XlaOp> start_indices(element_shape.dimensions().size() + 1, - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = push_index; xla::XlaOp list_part = xla::GetTupleElement(list, 0); @@ -418,7 +418,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, 
result_parts.push_back(updated_list_part); } - xla::XlaOp updated_push_index = push_index + xla::ConstantR0<int32>(b, 1); + xla::XlaOp updated_push_index = push_index + xla::ConstantR0<int32_t>(b, 1); result_parts.push_back(updated_push_index); *result = xla::Tuple(b, result_parts); @@ -441,14 +441,14 @@ absl::Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result, TF_ASSIGN_OR_RETURN(xla::Shape list_shape, b->GetShape(list)); int list_tuple_size = xla::ShapeUtil::TupleElementCount(list_shape); xla::XlaOp push_index = xla::GetTupleElement(list, list_tuple_size - 1); - push_index = push_index - xla::ConstantR0<int32>(b, 1); + push_index = push_index - xla::ConstantR0<int32_t>(b, 1); std::vector<xla::XlaOp> list_result_parts, element_result_parts; for (int i = 0; i < list_tuple_size - 1; i++) { const xla::Shape& list_part_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, i); std::vector<xla::XlaOp> start_indices(list_part_shape.dimensions().size(), - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = push_index; std::vector<int64_t> slice_shape = @@ -496,7 +496,7 @@ absl::Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, xla::XlaOp update = xla::Reshape(element, element_dims); std::vector<xla::XlaOp> start_indices(element_shape.dimensions().size() + 1, - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; xla::XlaOp list_part = xla::GetTupleElement(list, 0); @@ -550,7 +550,7 @@ absl::Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, const xla::Shape& buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0); std::vector<xla::XlaOp> start_indices(buffer_shape.dimensions().size(), - xla::ConstantR0<int32>(b, 0)); + xla::ConstantR0<int32_t>(b, 0)); start_indices[0] = index; std::vector<int64_t> slice_shape = @@ -585,7 +585,7 @@ absl::Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor, } std::vector<xla::XlaOp> result_parts{tensor, - xla::ConstantR0<int32>(b, push_index)}; + xla::ConstantR0<int32_t>(b, push_index)}; *result = xla::Tuple(b, result_parts); return absl::OkStatus(); } diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index 039320573f4558..9c4e0b63490205 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -137,7 +137,7 @@ class InvertPermutationOp : public XlaOpKernel { absl::Status status; switch (dtype) { case DT_INT32: - InvertPermutation<int32>(ctx); + InvertPermutation<int32_t>(ctx); break; case DT_INT64: InvertPermutation<int64_t>(ctx); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc index dbd6cda9d950d0..1d487f70d09d21 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc @@ -36,7 +36,7 @@ namespace tensorflow { namespace { using XlaUnaryOpGenerator = std::function<xla::XlaOp(xla::XlaOp)>; -using XlaOpGeneratorMap = absl::flat_hash_map<string, XlaUnaryOpGenerator>; +using XlaOpGeneratorMap = absl::flat_hash_map<std::string, XlaUnaryOpGenerator>; void PopulateXlaOpGeneratorMap(XlaOpGeneratorMap* op_generator_map) { auto add_xla_op_generator = [&](std::string name, @@ -120,7 +120,7 @@ class UnaryOpsCompositionOp : public XlaOpKernel { } private: - std::vector<string> op_names_; + std::vector<std::string> op_names_; }; REGISTER_XLA_OP(Name("_UnaryOpsComposition"), UnaryOpsCompositionOp); diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index a7a1a438f95b9e..c9ddab9efb6e22 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -165,7 +165,7 @@ class ResourceGatherOp : public XlaOpKernel { } private: - int32 batch_dims_; + int32_t batch_dims_; }; REGISTER_XLA_OP(Name("ResourceGather"), ResourceGatherOp); diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 415f465f0b5088..57821f74e97024 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -449,7 +449,8 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Add any TensorArray gradients touched by the body to the enclosing // graph. - for (const string& grad_source : update.tensor_array_gradients_accessed) { + for (const std::string& grad_source : + update.tensor_array_gradients_accessed) { VLOG(4) << "TensorArray " << resource->name() << " accessed gradient " << grad_source; XlaResource* gradient; @@ -553,7 +554,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Set token input for this "while" op. std::vector<xla::XlaOp> token_inputs; token_inputs.reserve(token_input_nodes_.size()); - for (const string& node_name : token_input_nodes_) { + for (const std::string& node_name : token_input_nodes_) { auto token_or = compiler->GetNodeToken(node_name); OP_REQUIRES_OK(ctx, token_or.status()); token_inputs.push_back(token_or.value()); @@ -590,7 +591,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { } else { int32_t dim_size = shape.dimensions(0); dynamic_dims.push_back( - xla::ConstantR0<int32>(ctx->builder(), dim_size)); + xla::ConstantR0<int32_t>(ctx->builder(), dim_size)); } // Set dynamic dimension size to 0 for element value. Inside the while diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.h b/tensorflow/compiler/tf2xla/kernels/while_op.h index 8e9f317ac4f3fe..b1937c14f0bebc 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.h +++ b/tensorflow/compiler/tf2xla/kernels/while_op.h @@ -61,8 +61,8 @@ class XlaWhileOp : public XlaOpKernel { NameAttrList cond_name_attr_; NameAttrList body_name_attr_; bool has_token_input_output_; - std::vector<string> token_input_nodes_; - string original_node_name_; + std::vector<std::string> token_input_nodes_; + std::string original_node_name_; // Whether to propagate compile time consts into the loop body. // This is not supported by default now since it may cause HBM memory // overheads. 
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc index 9a2a00c58732f3..e06c0b09ba9938 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc @@ -166,13 +166,13 @@ class XlaCallModuleOp : public XlaOpKernel { explicit XlaCallModuleOp(OpKernelConstruction *ctx) : XlaOpKernel(ctx) { int version; OP_REQUIRES_OK(ctx, ctx->GetAttr("version", &version)); - string module_str; + std::string module_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("module", &module_str)); std::vector<PartialTensorShape> expected_output_shapes; OP_REQUIRES_OK(ctx, ctx->GetAttr("Sout", &expected_output_shapes)); std::vector<DataType> expected_output_dtypes; OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &expected_output_dtypes)); - std::vector<string> dim_args_spec; + std::vector<std::string> dim_args_spec; OP_REQUIRES_OK(ctx, ctx->GetAttr("dim_args_spec", &dim_args_spec)); OP_REQUIRES(ctx, dim_args_spec.empty(), absl::UnimplementedError( @@ -183,9 +183,9 @@ class XlaCallModuleOp : public XlaOpKernel { "The size of Sout (", expected_output_shapes.size(), ") must match the size of Tout (", expected_output_dtypes.size(), ")"))); - std::vector<string> disabled_checks; + std::vector<std::string> disabled_checks; OP_REQUIRES_OK(ctx, ctx->GetAttr("disabled_checks", &disabled_checks)); - std::vector<string> platforms; + std::vector<std::string> platforms; OP_REQUIRES_OK(ctx, ctx->GetAttr("platforms", &platforms)); // TODO(necula): change this to OP_REQUIRES_OK when 6 months have passed // since we added the function_list and has_token_input_output @@ -222,7 +222,7 @@ class XlaCallModuleOp : public XlaOpKernel { }) << "])"; } - string compilation_device_type = ctx->device_type().type_string(); + std::string compilation_device_type = ctx->device_type().type_string(); compilation_platform_ = ""; if (compilation_device_type == DEVICE_CPU_XLA_JIT) { compilation_platform_ = "CPU"; @@ -293,7 +293,7 @@ class XlaCallModuleOp : public XlaOpKernel { xla::XlaOp token_input; if (!op_token_input_nodes_.empty()) { std::vector<xla::XlaOp> token_inputs; - for (const string &node_name : op_token_input_nodes_) { + for (const std::string& node_name : op_token_input_nodes_) { auto token = compiler->GetNodeToken(node_name); OP_REQUIRES_OK(ctx, token.status()); token_inputs.push_back(token.value()); diff --git a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc index 139ac17b35c637..99a0ec6d9e38dd 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc @@ -55,8 +55,8 @@ class XlaCustomCallOp : public XlaOpKernel { } private: - string target_name_; - string backend_config_; + std::string target_name_; + std::string backend_config_; DataType output_type_; TensorShape output_shape_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc index 7b0ea597c63488..6889c093a11201 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc @@ -42,7 +42,7 @@ class XlaDequantizeOp : public XlaOpKernel { xla::QuantizedRange range(min_range_, max_range_); xla::XlaOp output = - xla::Dequantize<uint8>(input, range, mode_, transpose_output_); + xla::Dequantize<uint8_t>(input, range, mode_, transpose_output_); context->SetOutput(0, output); } @@ -50,7 +50,7 @@ class XlaDequantizeOp : public XlaOpKernel { float min_range_; float max_range_; bool transpose_output_; 
- string mode_; + std::string mode_; XlaDequantizeOp(const XlaDequantizeOp&) = delete; void operator=(const XlaDequantizeOp&) = delete; }; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc index 8236e67eeded01..f77cb46c44de8c 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc @@ -34,12 +34,12 @@ namespace { class XlaDotOp : public XlaOpKernel { public: explicit XlaDotOp(OpKernelConstruction* context) : XlaOpKernel(context) { - string dnums_attr; + std::string dnums_attr; OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr)); OP_REQUIRES( context, dnums_.ParsePartialFromString(dnums_attr), errors::InvalidArgument("Error parsing convolution dimension numbers")); - string precision_config_attr; + std::string precision_config_attr; OP_REQUIRES_OK( context, context->GetAttr("precision_config", &precision_config_attr)); OP_REQUIRES( diff --git a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc index 0cfd247bdd1de6..7765de131e865c 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc @@ -41,7 +41,7 @@ class XlaSelfAdjointEigOp : public XlaOpKernel { private: bool lower_; - int32 max_iter_; + int32_t max_iter_; float epsilon_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc index f3bd088ced826a..6639c8003e1a15 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc @@ -37,7 +37,7 @@ class XlaSvdOp : public XlaOpKernel { explicit XlaSvdOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("max_iter", &max_iter_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_)); - string precision_config_attr; + std::string precision_config_attr; OP_REQUIRES_OK(ctx, ctx->GetAttr("precision_config", &precision_config_attr)); OP_REQUIRES(ctx, @@ -57,7 +57,7 @@ class XlaSvdOp : public XlaOpKernel { } private: - int32 max_iter_; + int32_t max_iter_; float epsilon_; xla::PrecisionConfig precision_config_; }; diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index 6a67cfa237af70..0028f8e61cbd11 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -222,7 +222,7 @@ static absl::Status XlaDotShapeFunction(shape_inference::InferenceContext* c) { return shape_inference::UnknownShape(c); } - string dimension_numbers_string; + std::string dimension_numbers_string; TF_RETURN_IF_ERROR( c->GetAttr("dimension_numbers", &dimension_numbers_string)); @@ -1027,7 +1027,7 @@ REGISTER_OP("XlaEinsum") .Attr("equation: string") .Attr("T: {complex64, bfloat16, float}") .SetShapeFn([](shape_inference::InferenceContext* context) { - string equation; + std::string equation; TF_RETURN_IF_ERROR(context->GetAttr("equation", &equation)); // XlaEinsum supports only two-input einsum equations. 
if (!absl::StrContains(equation, ",")) { @@ -1057,9 +1057,9 @@ REGISTER_OP("XlaSpmdFullToShardShape") if (!c->RankKnown(input_handle)) { return shape_inference::UnknownShape(c); } - string sharding_attr; + std::string sharding_attr; TF_RETURN_IF_ERROR(c->GetAttr("manual_sharding", &sharding_attr)); - int32 single_dim; + int32_t single_dim; TF_RETURN_IF_ERROR(c->GetAttr("dim", &single_dim)); xla::OpSharding sharding; sharding.ParseFromString(sharding_attr); diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc index 84ed56a468df8e..47e76f81a0328c 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc @@ -304,7 +304,7 @@ absl::Status MaybeRewriteWhileNode( resource_input_count, index_mapping)); // Modify cond and body functions. - for (auto const& attr_name : std::vector<string>{"cond", "body"}) { + for (auto const& attr_name : std::vector<std::string>{"cond", "body"}) { NameAttrList attr_value; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &attr_value)); const FunctionBody* fbody; @@ -363,7 +363,7 @@ absl::Status MaybeRewriteWhileNode( // Save the new FunctionDef. FunctionDef new_fdef; - string new_name = + std::string new_name = fld->UniqueFunctionName(absl::StrCat(attr_value.name(), "_rearrange_")); TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef)); @@ -435,7 +435,7 @@ absl::Status MaybeRewriteIfNode( std::map<int, int> resource_retval_to_arg, retval_index_mapping; for (auto const& attr_name : - std::vector<string>{"then_branch", "else_branch"}) { + std::vector<std::string>{"then_branch", "else_branch"}) { NameAttrList f; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &f)); const FunctionBody* fbody; @@ -459,7 +459,7 @@ absl::Status MaybeRewriteIfNode( // Save the new FunctionDef. 
FunctionDef new_fdef; - string new_name = + std::string new_name = fld->UniqueFunctionName(absl::StrCat(f.name(), "_rearrange_")); TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef)); diff --git a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc index 956f597301d28d..39efe2d682eb12 100644 --- a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc +++ b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc @@ -34,15 +34,16 @@ bool HasResourceInputOrOutput(const OpDef& op_def) { } TEST(ResourceOperationTableTest, HaveAllResourceOps) { - absl::flat_hash_map<string, bool> known_resource_ops; + absl::flat_hash_map<std::string, bool> known_resource_ops; for (absl::string_view known_resource_op : resource_op_table_internal::GetKnownResourceOps()) { ASSERT_TRUE( - known_resource_ops.insert({string(known_resource_op), false}).second); + known_resource_ops.insert({std::string(known_resource_op), false}) + .second); } - std::vector<string> xla_op_names = XlaOpRegistry::GetAllRegisteredOps(); - for (const string& xla_op_name : xla_op_names) { + std::vector<std::string> xla_op_names = XlaOpRegistry::GetAllRegisteredOps(); + for (const std::string& xla_op_name : xla_op_names) { const OpDef* op_def; TF_ASSERT_OK(OpRegistry::Global()->LookUpOpDef(xla_op_name, &op_def)); if (HasResourceInputOrOutput(*op_def)) { @@ -52,7 +53,7 @@ TEST(ResourceOperationTableTest, HaveAllResourceOps) { } } - std::vector<string> unnecessary_resource_ops; + std::vector<std::string> unnecessary_resource_ops; for (const auto& pair : known_resource_ops) { if (!pair.second) { unnecessary_resource_ops.push_back(pair.first); diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 7e0b70e4df270a..4b285078f94d21 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -50,7 +50,8 @@ xla::OpMetadata CreateOpMetadata(const std::string& op_type, } void AssignOpMetadataToSharding(xla::OpSharding& sharding, - const string& op_type, const string& op_name) { + const std::string& op_type, + const std::string& op_name) { auto metadata = CreateOpMetadata(op_type, op_name); if (sharding.type() == xla::OpSharding::TUPLE) { for (auto& sharding_element : *sharding.mutable_tuple_shardings()) { @@ -69,7 +70,7 @@ absl::Status CoreOutOfRangeError(int core, int num_cores_per_replica) { } // namespace absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromDevice( - const string& device_name, int num_cores_per_replica, + const std::string& device_name, int num_cores_per_replica, std::optional<xla::OpSharding> explicit_sharding, std::optional<xla::OpMetadata> metadata) { if (device_name.empty()) { @@ -102,7 +103,7 @@ absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromDevice( absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromDevice( const NodeDef& node_def, int num_cores_per_replica, bool add_metadata) { - const string& device_name = node_def.device(); + const std::string& device_name = node_def.device(); TF_ASSIGN_OR_RETURN(std::optional<xla::OpSharding> sharding, GetShardingFromNodeDef(node_def, add_metadata)); return ParseShardingFromDevice( @@ -114,7 +115,7 @@ absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromDevice( absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromDevice( const Node& node, int num_cores_per_replica, bool add_metadata) { - string device_name = node.assigned_device_name(); + std::string device_name = node.assigned_device_name(); if (device_name.empty()) { device_name = node.requested_device(); } @@ -152,7 +153,7 @@ absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromEdgeSource( } void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) { - string 
device_name = src.assigned_device_name(); + std::string device_name = src.assigned_device_name(); if (device_name.empty()) { device_name = src.requested_device(); } @@ -169,7 +170,7 @@ absl::StatusOr<std::optional<xla::OpSharding>> GetShardingFromNodeDefInternal( if (!HasNodeAttr(node_def, attribute)) { return std::optional<xla::OpSharding>(); } - string value; + std::string value; xla::OpSharding sharding; TF_RETURN_IF_ERROR(GetNodeAttr(node_def, attribute, &value)); if (tensorflow::DecodeShardingAttribute(value, sharding).failed()) { diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h index e579f3ee0ff397..85259e0c729883 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.h +++ b/tensorflow/compiler/tf2xla/sharding_util.h @@ -36,7 +36,7 @@ namespace tensorflow { // - a non-value if there is no assigned core or // - a sharding set as per xla::sharding_builder::AssignDevice. absl::StatusOr<std::optional<xla::OpSharding>> ParseShardingFromDevice( - const string& device_name, int num_cores_per_replica, + const std::string& device_name, int num_cores_per_replica, std::optional<xla::OpSharding> explicit_sharding = std::nullopt, std::optional<xla::OpMetadata> metadata = std::nullopt); diff --git a/tensorflow/compiler/tf2xla/sharding_util_test.cc b/tensorflow/compiler/tf2xla/sharding_util_test.cc index 585e3887fe686c..c987e8f167422f 100644 --- a/tensorflow/compiler/tf2xla/sharding_util_test.cc +++ b/tensorflow/compiler/tf2xla/sharding_util_test.cc @@ -33,7 +33,7 @@ TEST(CoreUtilTest, ParseShardingFromDevice) { Graph graph(OpRegistry::Global()); auto core_from_sharding = - [](std::optional<xla::OpSharding> sharding) -> int64 { + [](std::optional<xla::OpSharding> sharding) -> int64_t { if (sharding.has_value() && sharding.value().type() == xla::OpSharding::MAXIMAL) { return sharding.value().tile_assignment_devices(0); diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc index afe82e0de40f62..e8b2a56cdf64d2 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.cc +++ b/tensorflow/compiler/tf2xla/side_effect_util.cc @@ -48,8 +48,8 @@ absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { } else if (node->IsIfNode()) { AttrValue device_ordinal_value; device_ordinal_value.set_i(device_ordinal); - for (const string& attr_name : - std::vector<string>{"then_branch", "else_branch"}) { + for (const std::string& attr_name : + std::vector<std::string>{"then_branch", "else_branch"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -59,7 +59,8 @@ absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { } else if (node->IsWhileNode()) { AttrValue device_ordinal_value; device_ordinal_value.set_i(device_ordinal); - for (const string& attr_name : std::vector<string>{"cond", "body"}) { + for (const std::string& attr_name : + std::vector<std::string>{"cond", "body"}) { NameAttrList branch_func; TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func)); (*branch_func.mutable_attr())["_device_ordinal"] = device_ordinal_value; @@ -80,39 +81,40 @@ absl::Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g) { std::set<std::string> results; Node* first_side_effecting_node_on_path = nullptr; - ReverseDFS(g, - [&](Node* n) { - std::vector<string> token_input_nodes; - if (!GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, - &token_input_nodes) - .ok() || - token_input_nodes.empty()) { - return; - } - - if (first_side_effecting_node_on_path != nullptr) { - 
return; - } - - first_side_effecting_node_on_path = n; - string original_node_name; - TF_CHECK_OK(GetNodeAttr(n->def(), - kXlaOriginalOutsideCompilationNodeName, - &original_node_name)); - results.insert(original_node_name); - }, - [&](Node* n) { - if (first_side_effecting_node_on_path == n) { - first_side_effecting_node_on_path = nullptr; - } - }, - NodeComparatorName()); + ReverseDFS( + g, + [&](Node* n) { + std::vector<std::string> token_input_nodes; + if (!GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, + &token_input_nodes) + .ok() || + token_input_nodes.empty()) { + return; + } + + if (first_side_effecting_node_on_path != nullptr) { + return; + } + + first_side_effecting_node_on_path = n; + std::string original_node_name; + TF_CHECK_OK(GetNodeAttr(n->def(), + kXlaOriginalOutsideCompilationNodeName, + &original_node_name)); + results.insert(original_node_name); + }, + [&](Node* n) { + if (first_side_effecting_node_on_path == n) { + first_side_effecting_node_on_path = nullptr; + } + }, + NodeComparatorName()); return results; } bool HasSideEffectingNodes(const Graph& g) { for (Node* n : g.nodes()) { - std::vector<string> token_input_nodes; + std::vector<std::string> token_input_nodes; if (GetNodeAttr(n->attrs(), kXlaTokenInputNodesAttrName, &token_input_nodes) .ok() && !token_input_nodes.empty()) { @@ -123,10 +125,10 @@ bool HasSideEffectingNodes(const Graph& g) { } absl::Status ParseHostComputeCoreList( - absl::Span<const string> list_from_attr, - std::map<string, int>* host_compute_core) { + absl::Span<const std::string> list_from_attr, + std::map<std::string, int>* host_compute_core) { for (const auto& hc_core : list_from_attr) { - std::vector<string> parts = str_util::Split(hc_core, ":"); + std::vector<std::string> parts = str_util::Split(hc_core, ":"); if (parts.size() != 2) { return errors::InvalidArgument( "Malformed host_compute_core entry ", hc_core, diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h index 34f30eb7661bc1..9ba994a16a3c8e 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.h +++ b/tensorflow/compiler/tf2xla/side_effect_util.h @@ -61,8 +61,9 @@ bool HasSideEffectingNodes(const Graph& g); // Parse the mapping from outside_compilation_subgraph name to core number, // which is specified in an attr as a list of strings // :. -absl::Status ParseHostComputeCoreList(absl::Span<const string> list_from_attr, -                                      std::map<string, int>* host_compute_core); +absl::Status ParseHostComputeCoreList( + absl::Span<const std::string> list_from_attr, + std::map<std::string, int>* host_compute_core); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/test_util.cc b/tensorflow/compiler/tf2xla/test_util.cc index 43623a8db8014f..193eb7c08bc08a 100644 --- a/tensorflow/compiler/tf2xla/test_util.cc +++ b/tensorflow/compiler/tf2xla/test_util.cc @@ -21,12 +21,12 @@ limitations under the License. 
namespace tensorflow { absl::Status InstantiateFunctionForTest( - const string& name, const FunctionLibraryDefinition& library, + const std::string& name, const FunctionLibraryDefinition& library, InstantiationResultForTest* result) { const FunctionDef* fdef = library.Find(name); TF_RET_CHECK(fdef != nullptr); - auto get_func_sig = [&library](const string& op, const OpDef** sig) { + auto get_func_sig = [&library](const std::string& op, const OpDef** sig) { return library.LookUpOpDef(op, sig); }; InstantiationResult inst; diff --git a/tensorflow/compiler/tf2xla/test_util.h b/tensorflow/compiler/tf2xla/test_util.h index 2b2eb4f582af3e..2c9cdc1c352238 100644 --- a/tensorflow/compiler/tf2xla/test_util.h +++ b/tensorflow/compiler/tf2xla/test_util.h @@ -41,7 +41,7 @@ struct InstantiationResultForTest { // Instantiates a function, producing a GraphDef to compare against the // expected graph. absl::Status InstantiateFunctionForTest( - const string& name, const FunctionLibraryDefinition& library, + const std::string& name, const FunctionLibraryDefinition& library, InstantiationResultForTest* result); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc index 504e9d0246322e..eccc2dfaf8d4a4 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc @@ -32,7 +32,8 @@ namespace tensorflow { namespace tf2xla { namespace { -void PrintSupportedOps(const string& device, const string& regen_run) { +void PrintSupportedOps(const std::string& device, + const std::string& regen_run) { XlaOpRegistry::RegisterCompilationKernels(); std::vector<const KernelDef*> kdefs = @@ -46,10 +47,10 @@ void PrintSupportedOps(const string& device, const string& regen_run) { << "Operator | Type Constraint\n" << "-------- | ---------------" << std::endl; for (const KernelDef* kdef : kdefs) { - std::vector<string> constraints; + std::vector<std::string> constraints; constraints.reserve(kdef->constraint().size()); for (const KernelDef::AttrConstraint& constraint : kdef->constraint()) { - std::vector<string> types; + std::vector<std::string> types; const auto& allowed_values = constraint.allowed_values().list().type(); types.reserve(allowed_values.size()); for (int type : allowed_values) { @@ -70,18 +71,18 @@ void PrintSupportedOps(const string& device, const string& regen_run) { } // namespace void SupportedOpsMain(int argc, char** argv, const char* regen_run) { - std::vector<string> device_names = XlaOpRegistry::BackendNames(); + std::vector<std::string> device_names = XlaOpRegistry::BackendNames(); std::sort(device_names.begin(), device_names.end()); // Set up and parse flags. - string device; + std::string device; std::vector<Flag> flag_list = { {"device", &device, "Name of the compilation device for which to print supported ops, " "one of: " + absl::StrJoin(device_names, ",")}, }; - string usage = Flags::Usage(argv[0], flag_list); + std::string usage = Flags::Usage(argv[0], flag_list); bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list); QCHECK(parsed_flags_ok) << "\n" << usage; QCHECK(XlaOpRegistry::IsBackendRegistered(device)) diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index d61d66bfe53b72..72bd28f2b47a8c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -118,8 +118,8 @@ TEST(ConvertGraphDefToXla, Sum) { TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); // Set up arguments. 
-  auto x_literal = xla::LiteralUtil::CreateR0<int32>(10);
-  auto y_literal = xla::LiteralUtil::CreateR0<int32>(32);
+  auto x_literal = xla::LiteralUtil::CreateR0<int32_t>(10);
+  auto y_literal = xla::LiteralUtil::CreateR0<int32_t>(32);
   auto x_global_or = client->TransferToServer(x_literal);
   auto y_global_or = client->TransferToServer(y_literal);
   TF_EXPECT_OK(x_global_or.status());
@@ -140,23 +140,23 @@ TEST(ConvertGraphDefToXla, Sum) {
       ConvertGraphDefToXla(graph_def, config, client, &computation)));
 }
 
-GraphDef EinsumGraph() {
+GraphDef EinsumGraph(DataType dtype = DT_FLOAT) {
   GraphDef graph_def;
   NodeDef* x = graph_def.add_node();
   x->set_name("x");
   x->set_op("Placeholder");
-  (*x->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT);
+  (*x->mutable_attr())["dtype"] = TypeAttrValue(dtype);
   NodeDef* y = graph_def.add_node();
   y->set_name("y");
   y->set_op("Placeholder");
-  (*y->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT);
+  (*y->mutable_attr())["dtype"] = TypeAttrValue(dtype);
   NodeDef* einsum = graph_def.add_node();
   einsum->set_name("einsum");
   einsum->set_op("Einsum");
   einsum->add_input("x");
   einsum->add_input("y");
   (*einsum->mutable_attr())["equation"] = StringAttrValue("ij,jk->ik");
-  (*einsum->mutable_attr())["T"] = TypeAttrValue(DT_FLOAT);
+  (*einsum->mutable_attr())["T"] = TypeAttrValue(dtype);
   (*einsum->mutable_attr())["N"] = IntAttrValue(2);
   return graph_def;
 }
@@ -233,6 +233,35 @@ TEST_F(ConvertGraphDefToXlaWithTF32Disabled,
   EXPECT_EQ(num_dots, 1);
 }
 
+TEST_F(ConvertGraphDefToXlaWithTF32Disabled,
+       EinsumIsConvertedToDotWithDefaultPrecisionIfNotF32) {
+  GraphDef graph_def = EinsumGraph(DT_BFLOAT16);
+  tf2xla::Config config = EinsumConfig();
+
+  xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie();
+  xla::XlaComputation computation;
+  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation));
+
+  int num_dots = 0;
+  const xla::HloModuleProto& module_proto = computation.proto();
+  for (const xla::HloComputationProto& computation_proto :
+       module_proto.computations()) {
+    for (const xla::HloInstructionProto& instruction_proto :
+         computation_proto.instructions()) {
+      if (instruction_proto.opcode() == "dot") {
+        num_dots++;
+        ASSERT_EQ(instruction_proto.precision_config().operand_precision_size(),
+                  2);
+        EXPECT_EQ(instruction_proto.precision_config().operand_precision(0),
+                  xla::PrecisionConfig::DEFAULT);
+        EXPECT_EQ(instruction_proto.precision_config().operand_precision(1),
+                  xla::PrecisionConfig::DEFAULT);
+      }
+    }
+  }
+  EXPECT_EQ(num_dots, 1);
+}
+
 GraphDef Conv2DGraph() {
   GraphDef graph_def;
   NodeDef* x = graph_def.add_node();
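The test added above pins down the intended behavior for non-F32 inputs: TensorFloat-32 only degrades F32 math, so a bf16 einsum should keep DEFAULT operand precision even when TF32 is disabled. A brief usage sketch, reusing names from the test file:

    // EinsumGraph() still defaults to DT_FLOAT; the new test builds:
    GraphDef graph_def = EinsumGraph(DT_BFLOAT16);
    // After ConvertGraphDefToXla, the resulting "dot" instruction is expected
    // to carry xla::PrecisionConfig::DEFAULT on both operands, not HIGHEST.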
@@ -338,8 +367,8 @@ TEST(ConvertGraphDefToXla, SumWithUnusedArgument) {
   TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation));
 
   // Set up arguments.
-  auto x_literal = xla::LiteralUtil::CreateR0<int32>(10);
-  auto y_literal = xla::LiteralUtil::CreateR0<int32>(32);
+  auto x_literal = xla::LiteralUtil::CreateR0<int32_t>(10);
+  auto y_literal = xla::LiteralUtil::CreateR0<int32_t>(32);
   auto x_global_or = client->TransferToServer(x_literal);
   auto y_global_or = client->TransferToServer(y_literal);
   auto unused_global_or = client->TransferToServer(y_literal);
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 9f21af2741dcde..042b572c234355 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -58,8 +58,9 @@ absl::Status ValidateTensorId(const tf2xla::TensorId& id) {
   return absl::OkStatus();
 }
 
-absl::Status CheckNameDuplicates(const string& kind, const string& name,
-                                 std::set<string>* names) {
+absl::Status CheckNameDuplicates(const std::string& kind,
+                                 const std::string& name,
+                                 std::set<std::string>* names) {
   if (!name.empty()) {
     if (!names->insert(name).second) {
       return errors::InvalidArgument("duplicate ", kind, " name: ", name);
@@ -68,12 +69,12 @@ absl::Status CheckNameDuplicates(const string& kind, const string& name,
   return absl::OkStatus();
 }
 
-absl::Status CheckFeedFetchNameConflicts(const string& kind,
-                                         const std::set<string>& names) {
+absl::Status CheckFeedFetchNameConflicts(const std::string& kind,
+                                         const std::set<std::string>& names) {
   // We don't allow the feeds or fetches to contain both "foo" and "foo_data",
   // since that will cause a collision in codegen symbols.
-  for (const string& name : names) {
-    const string name_data(name + "_data");
+  for (const std::string& name : names) {
+    const std::string name_data(name + "_data");
     if (names.find(name_data) != names.end()) {
       return errors::InvalidArgument("conflicting ", kind, " name: ", name,
                                      " and ", name_data);
@@ -227,7 +228,7 @@ absl::Status ReplaceRetvalInputWithArg(
 // the function to replace _Arg nodes in `const_input_index_to_node` with Const
 // inputs.
 absl::Status PropagateConstIntoFuncAttr(
-    Node* n, const string& attr_name,
+    Node* n, const std::string& attr_name,
     const absl::flat_hash_map<int, const Node*>& const_input_index_to_node,
     const FunctionLibraryDefinition* lookup_fld, FunctionLibraryDefinition* fld,
     bool passthrough_arg_to_retval = false) {
@@ -255,7 +256,7 @@ absl::Status PropagateConstIntoFuncAttr(
 
   // Save rewritten function.
   FunctionDef replace_fdef;
-  string new_func_name =
+  std::string new_func_name =
       fld->UniqueFunctionName(absl::StrCat(func_attr.name(), "_const_"));
   const StackTracesMap* stack_traces =
       lookup_fld->GetStackTraces(func_attr.name());
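The "_data" rule above exists because the AOT codegen derives a <name>_data symbol for every feed and fetch, so a user-supplied pair such as "foo" and "foo_data" would collide. A small illustration, relying only on the documented contract:

    #include <set>
    #include <string>

    std::set<std::string> names = {"foo", "foo_data"};
    // CheckFeedFetchNameConflicts("feed", names) returns an InvalidArgument
    // error: the symbol generated for "foo" clashes with the entry "foo_data".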
@@ -301,7 +302,7 @@ absl::Status PropagateConstIntoIfNode(
   // Rewrite "then_branch" and "else_branch" function, replace usage of those
   // _Arg nodes with corresponding const node.
   for (const auto& attr_name :
-       std::vector<string>{"then_branch", "else_branch"}) {
+       std::vector<std::string>{"then_branch", "else_branch"}) {
     TF_RETURN_IF_ERROR(PropagateConstIntoFuncAttr(
         if_node, attr_name, const_input_index_to_node, lookup_fld, fld));
   }
@@ -309,13 +310,14 @@ absl::Status PropagateConstIntoIfNode(
   return absl::OkStatus();
 }
 
-using GraphCache = absl::flat_hash_map<string, std::unique_ptr<FunctionBody>>;
+using GraphCache =
+    absl::flat_hash_map<std::string, std::unique_ptr<FunctionBody>>;
 
 absl::StatusOr<FunctionBody*> FindOrInsert(
     GraphCache* cache, const NameAttrList& body_attr,
     const FunctionLibraryDefinition* lookup_fld,
     const FunctionLibraryDefinition* fallback_fld) {
-  const string name = body_attr.name();
+  const std::string name = body_attr.name();
   std::unique_ptr<FunctionBody>& value = (*cache)[name];
   if (!value) {
     const FunctionDef* body_func = lookup_fld->Find(name);
@@ -413,7 +415,7 @@ absl::Status PropagateConstIntoAndAroundWhileNode(
   absl::flat_hash_map<int, Node*> const_input_index_to_mutable_node;
   NameAttrList body_attr;
   TF_RETURN_IF_ERROR(GetNodeAttr(while_node->def(), "body", &body_attr));
-  const string fn_name = body_attr.name();
+  const std::string fn_name = body_attr.name();
   const FunctionDef* body_func = lookup_fld->Find(fn_name);
   if (!body_func) {
     return errors::Internal("Propagate: Cannot find body function ", fn_name,
@@ -461,7 +463,7 @@ absl::Status PropagateConstIntoAndAroundWhileNode(
 
   // Rewrite "cond" and "body" function, replace usage of those _Arg nodes with
   // corresponding const node.
-  for (const auto& attr_name : std::vector<string>{"cond", "body"}) {
+  for (const auto& attr_name : std::vector<std::string>{"cond", "body"}) {
     TF_RETURN_IF_ERROR(PropagateConstIntoFuncAttr(
         while_node, attr_name, const_input_index_to_node, lookup_fld, fld,
         /*passthrough_arg_to_retval=*/attr_name == "body"));
@@ -487,7 +489,7 @@ absl::StatusOr<bool> IsLoopInvariant(
 }
 
 absl::Status ValidateConfig(const tf2xla::Config& config) {
-  std::set<string> names;
+  std::set<std::string> names;
   for (const tf2xla::Feed& feed : config.feed()) {
     TF_RETURN_IF_ERROR(ValidateTensorId(feed.id()));
     TF_RETURN_IF_ERROR(TensorShape::IsValidShape(feed.shape()));
@@ -508,19 +510,20 @@ absl::Status ValidateConfig(const tf2xla::Config& config) {
 
 absl::Status AddPlaceholdersForFeeds(
     const tf2xla::Config& config, const OpRegistryInterface* op_registry,
-    std::unordered_map<string, string>* feed_remapping, GraphDef* graph_def) {
+    std::unordered_map<std::string, std::string>* feed_remapping,
+    GraphDef* graph_def) {
   struct PlaceholderInfo {
     const tf2xla::Feed* feed = nullptr;  // point to Feed in <config>.
-    string placeholder_name;
+    std::string placeholder_name;
     DataType data_type = DT_INVALID;
   };
 
   // Put each fed tensor into a map by name:port. A map is used for determinism
   // when creating placeholders (genrules want deterministic output).
-  std::map<string, PlaceholderInfo> placeholder_info;
+  std::map<std::string, PlaceholderInfo> placeholder_info;
   for (int i = 0; i < config.feed_size(); ++i) {
     const tf2xla::Feed* feed = &config.feed(i);
-    const string name_port = TensorIdToString(feed->id());
+    const std::string name_port = TensorIdToString(feed->id());
     PlaceholderInfo& info = placeholder_info[name_port];
     info.feed = feed;
     info.placeholder_name = absl::StrCat("aot_feed_", feed->id().output_index(),
@@ -529,7 +532,7 @@ absl::Status AddPlaceholdersForFeeds(
   }
 
   // Verify node exists and determine data type.
-  std::unordered_map<string, const NodeDef*> name_to_node;
+  std::unordered_map<std::string, const NodeDef*> name_to_node;
   for (int i = 0; i < graph_def->node_size(); ++i) {
     name_to_node[graph_def->node(i).name()] = &graph_def->node(i);
   }
@@ -609,25 +612,25 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in,
   out->clear_node();
 
   // Tensors needed for feeding.
-  std::set<std::pair<string, int>> feed_tensors;
+  std::set<std::pair<std::string, int>> feed_tensors;
   for (const tf2xla::Feed& feed : config.feed()) {
     feed_tensors.insert(
         std::make_pair(feed.id().node_name(), feed.id().output_index()));
   }
 
   // Maps node name to reachability.
-  std::unordered_map<string, std::pair<bool, const NodeDef*>> node_by_name;
+  std::unordered_map<std::string, std::pair<bool, const NodeDef*>> node_by_name;
   for (const NodeDef& node : in.node()) {
     node_by_name[node.name()] = std::pair<bool, const NodeDef*>(false, &node);
   }
 
   // Traverse.
-  std::queue<string> name_queue;
+  std::queue<std::string> name_queue;
   for (int i = 0; i < config.fetch_size(); ++i) {
     name_queue.push(config.fetch(i).id().node_name());
   }
   while (!name_queue.empty()) {
-    const string name = name_queue.front();
+    const std::string name = name_queue.front();
     name_queue.pop();
 
     auto find_it = node_by_name.find(name);
@@ -642,9 +645,9 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in,
     map_entry.first = true;
 
     // Push input nodes of the currently visited node to name_queue.
-    for (const string& in_edge : map_entry.second->input()) {
+    for (const std::string& in_edge : map_entry.second->input()) {
       auto id = ParseTensorName(in_edge);
-      const string node_name = string(id.first);
+      const std::string node_name = std::string(id.first);
       if (feed_tensors.find(std::make_pair(node_name, id.second)) ==
           feed_tensors.end()) {
         name_queue.push(node_name);
@@ -668,7 +671,7 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in,
   return absl::OkStatus();
 }
 
-string TensorIdToString(const tf2xla::TensorId& id) {
+std::string TensorIdToString(const tf2xla::TensorId& id) {
   return absl::StrCat(id.node_name(), ":", id.output_index());
 }
 
@@ -682,7 +685,7 @@ absl::Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
         std::optional<xla::OpSharding> sharding,
         ParseShardingFromDevice(
             *possible_match,
-            /*num_cores_per_replica=*/std::numeric_limits<int32>::max(),
+            /*num_cores_per_replica=*/std::numeric_limits<int32_t>::max(),
             /*add_metadata=*/false));
     if (sharding && sharding->type() == xla::OpSharding::MAXIMAL) {
       const int core_annotation = sharding.value().tile_assignment_devices(0);
@@ -709,7 +712,7 @@ void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype,
 }
 
 namespace {
-uint32 InitialRandomSeed() {
+uint32_t InitialRandomSeed() {
   // Support plumbing the TF seed through to XLA is being worked on.
   // If a user wants deterministic behavior, their best option
   // is to start with a known checkpoint. This also handles issues when
@@ -724,13 +727,13 @@ uint32 InitialRandomSeed() {
 }
 }  // namespace
 
-uint32 GetXLARandomSeed() {
+uint32_t GetXLARandomSeed() {
   // We initialize counter with an odd number and increment it by two
   // everytime. This ensures that it will never be zero, even
   // after an overflow. When seeded with zero, some XLA backends
   // can return all zeros instead of random numbers.
-  static std::atomic<uint32> counter(InitialRandomSeed());
-  uint32 seed = counter.fetch_add(2);
+  static std::atomic<uint32_t> counter(InitialRandomSeed());
+  uint32_t seed = counter.fetch_add(2);
   std::srand(seed);
   return std::rand() | 1;
 }
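A note on why GetXLARandomSeed above can never hand out a zero counter value: the counter starts odd, and fetch_add(2) preserves parity even across unsigned wraparound, because 2^32 is even. A self-contained sketch of the same invariant, illustrative only:

    #include <atomic>
    #include <cstdint>

    std::atomic<uint32_t> counter(1);  // odd initial value

    uint32_t NextSeed() {
      // fetch_add returns the previous value; odd plus an even step stays odd
      // modulo 2^32, so the result is never zero.
      return counter.fetch_add(2);
    }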
@@ -766,7 +769,7 @@ bool HasAssociatedFunction(const NodeDef& node_def,
 std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
     const Node& node, const FunctionLibraryDefinition* fld) {
   std::vector<AssociatedFunctionInfo> results;
-  const string& op = node.type_string();
+  const std::string& op = node.type_string();
   if (fld->Contains(op)) {
     // This is a function call node.
     AttrValueMap attrs(node.attrs().begin(), node.attrs().end());
@@ -795,7 +798,7 @@ std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
 absl::Status RewriteAssociatedFunction(
     Graph* graph, Node* node, FunctionLibraryDefinition* fld,
     const AssociatedFunctionInfo& associated_function,
-    const string& rewritten_function_name) {
+    const std::string& rewritten_function_name) {
   switch (associated_function.type()) {
     case AssociatedFunctionInfo::kFunctionCallNode: {
       // Change this node to call the new function.
@@ -834,7 +837,7 @@ absl::Status RewriteAssociatedFunction(
       GradientDef gradient_def;
       gradient_def.set_function_name(func.name());
       gradient_def.set_gradient_func(rewritten_function_name);
-      string original_grad_func = fld->FindGradient(func.name());
+      std::string original_grad_func = fld->FindGradient(func.name());
       if (original_grad_func.empty()) {
         TF_RETURN_IF_ERROR(fld->AddGradientDef(gradient_def));
       } else if (original_grad_func != rewritten_function_name) {
@@ -863,9 +866,9 @@ absl::Status RewriteAssociatedFunction(
 }
 
 absl::Status CachedFunctionHandles::GetOrInstantiate(
-    const string& func_name, AttrSlice attrs,
+    const std::string& func_name, AttrSlice attrs,
     FunctionLibraryRuntime::Handle* handle) {
-  string canonicalized_name = Canonicalize(func_name, attrs);
+  std::string canonicalized_name = Canonicalize(func_name, attrs);
   auto iter = handles_.find(canonicalized_name);
   if (iter != handles_.end()) {
     *handle = iter->second;
@@ -919,8 +922,8 @@ absl::StatusOr<Node*> ReplaceNode(Graph* g, Node* n, const NodeDef& node_def) {
 }
 
 absl::StatusOr<Node*> BuildIdentityNode(
-    Graph* graph, const string& node_name, DataType dtype, const Node* input,
-    std::optional<string> requested_device) {
+    Graph* graph, const std::string& node_name, DataType dtype,
+    const Node* input, std::optional<std::string> requested_device) {
   // Create identity node.
   NodeDef ndef;
   ndef.set_name(node_name);
@@ -975,7 +978,7 @@ absl::Status PruneUnreachableFunctionsFromGraph(
   g.ToGraphDef(&graph_def);
   FunctionLibraryDefinition reachable_functions =
       fld->ReachableDefinitions(graph_def);
-  for (const string& func_name : fld->ListFunctionNames()) {
+  for (const std::string& func_name : fld->ListFunctionNames()) {
     if (!reachable_functions.Find(func_name)) {
       TF_RETURN_IF_ERROR(fld->RemoveFunction(func_name));
     }
@@ -1106,7 +1109,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g,
 
   // Add rewritten backward While body function.
   FunctionDef new_fdef;
-  string new_name = fld->UniqueFunctionName(
+  std::string new_name = fld->UniqueFunctionName(
       absl::StrCat(bwd_body_attr.name(), "_tl_rewrite_"));
   TF_RETURN_IF_ERROR(
       GraphToFunctionDef(*bwd_fbody->graph, new_name, &new_fdef));
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index f2ce3944ac158c..4da5a474d964dc 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -41,7 +41,8 @@ absl::Status ValidateConfig(const tf2xla::Config& config);
 // feeds).
 absl::Status AddPlaceholdersForFeeds(
     const tf2xla::Config& config, const OpRegistryInterface* op_registry,
-    std::unordered_map<string, string>* feed_remapping, GraphDef* graph_def);
+    std::unordered_map<std::string, std::string>* feed_remapping,
+    GraphDef* graph_def);
 
 // Returns in <out> a copy of <in>, pruned to only include fetches from
 // <config>.
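CachedFunctionHandles::GetOrInstantiate above memoizes instantiations keyed by Canonicalize(func_name, attrs), so repeated requests with identical attributes reuse one handle. A minimal sketch of the pattern, with a hypothetical Instantiate stand-in for the FunctionLibraryRuntime call:

    #include <map>
    #include <string>

    int Instantiate(const std::string& canonical_name);  // hypothetical

    std::map<std::string, int> handles;  // canonical name -> handle

    int GetOrInstantiate(const std::string& canonical_name) {
      auto it = handles.find(canonical_name);
      if (it != handles.end()) return it->second;  // cache hit
      const int handle = Instantiate(canonical_name);
      handles[canonical_name] = handle;
      return handle;
    }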
@@ -49,7 +50,7 @@ absl::Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in,
                                GraphDef* out);
 
 // Returns node:port for the given <id>.
-string TensorIdToString(const tf2xla::TensorId& id);
+std::string TensorIdToString(const tf2xla::TensorId& id);
 
 // Updates the sharding of <n> based on the sharding of its neighbors.
 // If <out_edges> is true, outgoing edges from <n> are considered; else incoming
@@ -61,7 +62,7 @@ void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype,
                                    KernelDef* kdef);
 
 // Returns the next random seed to use for seeding xla rng.
-uint32 GetXLARandomSeed();
+uint32_t GetXLARandomSeed();
 
 // Indicates how a FunctionDef is associated with a graph node (e.g. the node is
 // a function call, or the node has function attrs).
@@ -74,14 +75,14 @@ class AssociatedFunctionInfo {
   };
 
   // The function is an attr of the node.
-  static AssociatedFunctionInfo FunctionAttr(const string& func_name,
+  static AssociatedFunctionInfo FunctionAttr(const std::string& func_name,
                                              const AttrValueMap& attrs,
-                                             const string& attr_name) {
+                                             const std::string& attr_name) {
     return AssociatedFunctionInfo(kFunctionAttr, func_name, attrs, attr_name);
   }
 
   // The node is a function call.
-  static AssociatedFunctionInfo FunctionCall(const string& func_name,
+  static AssociatedFunctionInfo FunctionCall(const std::string& func_name,
                                              const AttrValueMap& attrs) {
     // attr_name will not be used in this case.
     return AssociatedFunctionInfo(kFunctionCallNode, func_name, attrs,
@@ -89,7 +90,7 @@ class AssociatedFunctionInfo {
   }
 
   // The node is a SymbolicGradient op.
-  static AssociatedFunctionInfo SymbolicGradient(const string& func_name,
+  static AssociatedFunctionInfo SymbolicGradient(const std::string& func_name,
                                                  const AttrValueMap& attrs) {
     // attr_name will not be used in this case.
     return AssociatedFunctionInfo(kSymbolicGradient, func_name, attrs,
@@ -98,15 +99,17 @@ class AssociatedFunctionInfo {
 
   AssociatedFunctionType type() const { return type_; }
 
-  const string& func_name() const { return func_name_; }
+  const std::string& func_name() const { return func_name_; }
 
-  const string& attr_name() const { return attr_name_; }
+  const std::string& attr_name() const { return attr_name_; }
 
   const AttrValueMap& attrs() const { return attrs_; }
 
  private:
-  AssociatedFunctionInfo(AssociatedFunctionType type, const string& func_name,
-                         const AttrValueMap& attrs, const string& attr_name)
+  AssociatedFunctionInfo(AssociatedFunctionType type,
+                         const std::string& func_name,
+                         const AttrValueMap& attrs,
+                         const std::string& attr_name)
       : type_(type),
        func_name_(func_name),
        attrs_(attrs),
@@ -114,11 +117,11 @@ class AssociatedFunctionInfo {
 
   // Available for all instances.
   AssociatedFunctionType type_;
-  string func_name_;
+  std::string func_name_;
   AttrValueMap attrs_;
 
   // Only available if the function is defined in an attr.
-  string attr_name_;
+  std::string attr_name_;
 };
 
 // Returns if the NodeDef has associated function.
@@ -142,7 +145,7 @@ std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
 absl::Status RewriteAssociatedFunction(
     Graph* graph, Node* node, FunctionLibraryDefinition* fld,
     const AssociatedFunctionInfo& associated_function,
-    const string& rewritten_function_name);
+    const std::string& rewritten_function_name);
 
 // Class to act as cache for FunctionLibraryRuntime::Handle objects.
 class CachedFunctionHandles {
@@ -152,7 +155,7 @@ class CachedFunctionHandles {
   // Populates `handle` for requested function and attributes. If we have
   // instantiated the function with the same attributes before, `handle` will be
   // cached handle; otherwise instantiate the function and populate `handle`.
-  absl::Status GetOrInstantiate(const string& func_name, AttrSlice attrs,
+  absl::Status GetOrInstantiate(const std::string& func_name, AttrSlice attrs,
                                 FunctionLibraryRuntime::Handle* handle);
 
   // Releases all handles in the cache. Returns first non-OK status if any;
@@ -163,7 +166,7 @@ class CachedFunctionHandles {
 
  private:
   FunctionLibraryRuntime* flr_;
-  std::map<string, FunctionLibraryRuntime::Handle> handles_;
+  std::map<std::string, FunctionLibraryRuntime::Handle> handles_;
 
   CachedFunctionHandles(const CachedFunctionHandles&) = delete;
   void operator=(const CachedFunctionHandles&) = delete;
@@ -179,9 +182,9 @@ struct OutEdgeInfo {
 absl::StatusOr<Node*> ReplaceNode(Graph* g, Node* n, const NodeDef& node_def);
 
 // Helper function that builds an Identity node.
-absl::StatusOr<Node*> BuildIdentityNode(Graph* graph, const string& node_name,
-                                        DataType dtype, const Node* input,
-                                        std::optional<string> requested_device);
+absl::StatusOr<Node*> BuildIdentityNode(
+    Graph* graph, const std::string& node_name, DataType dtype,
+    const Node* input, std::optional<std::string> requested_device);
 
 // For "If"/"While" nodes, if some of their inputs are Const nodes, rewrite
 // body functions to use the Const nodes instead of original _Arg nodes.
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index e66a8a38813474..ef64b82f50e5be 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -157,7 +157,7 @@ TEST(ValidateConfig, ConflictingFetchName) {
   ExpectErrorContains(ValidateConfig(config), "conflicting fetch name");
 }
 
-static tf2xla::Config FetchesConfig(std::vector<string> fetches) {
+static tf2xla::Config FetchesConfig(std::vector<std::string> fetches) {
   tf2xla::Config config;
   for (const auto& fetch_node_name : fetches) {
     auto* fetch = config.add_fetch();
@@ -409,7 +409,7 @@ TEST(PropagateConstIntoFunctionalNodes, CopiedConstNodeHasUniqueName) {
   TF_ASSERT_OK(GetNodeAttr(while_node->def(), "body", &body_fn));
   const FunctionDef* rewritten_body_fn = fld.Find(body_fn.name());
   ASSERT_NE(rewritten_body_fn, nullptr);
-  std::unordered_map<string, NodeDef> nodes;
+  std::unordered_map<std::string, NodeDef> nodes;
   for (const NodeDef& node_def : rewritten_body_fn->node_def()) {
     nodes[node_def.name()] = node_def;
   }
diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc
index ec456344bcfced..007ecef7492600 100644
--- a/tensorflow/compiler/tf2xla/type_util.cc
+++ b/tensorflow/compiler/tf2xla/type_util.cc
@@ -87,6 +87,9 @@ absl::Status DataTypeToPrimitiveType(DataType data_type,
     case tensorflow::DT_FLOAT8_E5M2FNUZ:
       *type = xla::F8E5M2FNUZ;
       return absl::OkStatus();
+    case tensorflow::DT_FLOAT4_E2M1FN:
+      *type = xla::F4E2M1FN;
+      return absl::OkStatus();
     case tensorflow::DT_BFLOAT16:
       *type = xla::BF16;
       return absl::OkStatus();
@@ -122,6 +125,7 @@ absl::StatusOr<DataType> EncodePrimitiveTypeAsDataType(
       {xla::F8E4M3FNUZ, DT_FLOAT8_E4M3FNUZ},
       {xla::F8E4M3B11FNUZ, DT_FLOAT8_E4M3B11FNUZ},
       {xla::F8E5M2FNUZ, DT_FLOAT8_E5M2FNUZ},
+      {xla::F4E2M1FN, DT_FLOAT4_E2M1FN},
       {xla::BF16, DT_BFLOAT16},
       {xla::F16, DT_HALF},
       {xla::F32, DT_FLOAT},
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 215decdb4d8843..add79c369b69ef 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -39,7 +39,7 @@ class XlaCompilationAllocator : public Allocator {
   XlaCompilationAllocator() {}
   ~XlaCompilationAllocator() override {}
 
-  string Name() override { return "xla_compilation"; }
+  std::string Name() override { return "xla_compilation"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
     // Regardless of the size requested, always allocates an XlaExpression.
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index 7ca32b83f158af..5ee45e499cb49e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -34,7 +34,7 @@ namespace tensorflow {
 
 namespace {
 
-int32 GetResultIndex(const int32* result_index_table, int32 num_results) {
+int32_t GetResultIndex(const int32_t* result_index_table, int32_t num_results) {
   auto it =
       std::min_element(result_index_table, result_index_table + num_results);
 
@@ -150,7 +150,7 @@ int LookupNameIndex(absl::string_view name, const char** names) {
 
 }  // namespace
 
-int XlaCompiledCpuFunction::LookupArgIndex(const string& name) const {
+int XlaCompiledCpuFunction::LookupArgIndex(const std::string& name) const {
   return LookupNameIndex(name, arg_names_);
 }
 
@@ -162,7 +162,7 @@ int XlaCompiledCpuFunction::LookupVariableIndex(absl::string_view name) const {
   return num_args_ - num_variables_ + index;
 }
 
-int XlaCompiledCpuFunction::LookupResultIndex(const string& name) const {
+int XlaCompiledCpuFunction::LookupResultIndex(const std::string& name) const {
   return LookupNameIndex(name, result_names_);
 }
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 3d5bff87b3570f..061982db6fd08f 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
+#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h"
 #include "xla/backends/cpu/alignment.h"
 #include "xla/backends/cpu/buffer_allocation_info.h"
 #include "xla/backends/cpu/runtime/rng_state_lib.h"
@@ -128,14 +129,14 @@ class XlaCompiledCpuFunction {
 
   // Result parameter i is described by
   // buffer_infos[result_index_table[i]].
-  const int32* result_index_table_ = nullptr;
+  const int32_t* result_index_table_ = nullptr;
 
   // There are num_results result parameters.
   int64_t num_results_ = 0;
 
   // Entry parameter i is described by
   // buffer_infos[arg_index_table[i]].
-  const int32* arg_index_table_ = nullptr;
+  const int32_t* arg_index_table_ = nullptr;
 
   // There are num_args entry parameters.
   int64_t num_args_ = 0;
@@ -209,7 +210,7 @@ class XlaCompiledCpuFunction {
   // TODO(fschneider): For now this always returns an empty string because there
   // is no support for error reporting in XLA. Remove this once all callers are
   // updated.
-  string error_msg() const { return error_msg_; }
+  std::string error_msg() const { return error_msg_; }
 
   void set_error_msg(absl::string_view error_msg) { error_msg_ = error_msg; }
 
@@ -302,7 +303,7 @@ class XlaCompiledCpuFunction {
   // The index remains constant for every instance of XlaCompiledCpuFunction
   // generated from the same static data, and might not be cheap to determine.
   // Recommended usage is to capture this in a variable for re-use.
-  int LookupArgIndex(const string& name) const;
+  int LookupArgIndex(const std::string& name) const;
 
   // Returns the 0-based index for the variable with the given `name`.
   // Returns -1 if the name wasn't found, or data isn't available.
@@ -318,7 +319,7 @@ class XlaCompiledCpuFunction {
   // The index remains constant for every instance of XlaCompiledCpuFunction
   // generated from the same static data, and might not be cheap to determine.
   // Recommended usage is to capture this in a variable for re-use.
-  int LookupResultIndex(const string& name) const;
+  int LookupResultIndex(const std::string& name) const;
 
   // Returns the name of the argument at `index`.
   // Returns nullptr if `HasNameIndices() == false` or `index` is out of range.
@@ -364,7 +365,7 @@ class XlaCompiledCpuFunction {
     return buffer_infos_;
   }
 
-  int32 num_buffers() const { return num_buffers_; }
+  int32_t num_buffers() const { return num_buffers_; }
 
   void** buffer_table() const { return buffer_table_; }
 
@@ -423,7 +424,7 @@ class XlaCompiledCpuFunction {
   }
 
   static void set_static_data_result_index_table(
-      StaticData* static_data, const int32* result_index_table) {
+      StaticData* static_data, const int32_t* result_index_table) {
     static_data->result_index_table_ = result_index_table;
   }
 
@@ -433,7 +434,7 @@ class XlaCompiledCpuFunction {
   }
 
   static void set_static_data_arg_index_table(StaticData* static_data,
-                                              const int32* arg_index_table) {
+                                              const int32_t* arg_index_table) {
     static_data->arg_index_table_ = arg_index_table;
   }
 
@@ -530,21 +531,21 @@ class XlaCompiledCpuFunction {
 
   // Describes the buffers used by the XLA computation.
   const xla::cpu::BufferAllocationInfo* const buffer_infos_;
-  const int32 num_buffers_;
+  const int32_t num_buffers_;
 
   // Indices of expanded result tuple.
-  const int32 num_results_;
-  const int32* const result_index_table_;
+  const int32_t num_results_;
+  const int32_t* const result_index_table_;
 
   // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]]
   // for XLA generated code to be able to find it.
-  const int32* const arg_index_table_;
+  const int32_t* const arg_index_table_;
 
   // The number of incoming arguments.
-  const int32 num_args_;
+  const int32_t num_args_;
 
   // The number of incoming variables.
-  const int32 num_variables_;
+  const int32_t num_variables_;
 
   // Shapes of the input arguments.
   const ShapeInfo* const arg_shape_infos_;
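The lookup methods touched above scan a flat name table linearly, which is why the surrounding comments recommend resolving an index once and reusing it. A hedged usage sketch, assuming an XlaCompiledCpuFunction instance fn and an argument actually named "x":

    // The index is stable across all instances generated from the same
    // StaticData, so look it up once.
    const int x_index = fn.LookupArgIndex("x");
    fn.set_arg_data(x_index, &x_value);
    fn.Run();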
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 9e761dc6003d80..5088badf28e9cb 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -130,7 +131,7 @@ ComputeArgAndRetvalShardings(const Graph& graph) {
       [](const Node* n) -> absl::StatusOr<std::optional<xla::OpSharding>> {
         TF_ASSIGN_OR_RETURN(
             auto sharding,
-            ParseShardingFromDevice(*n, std::numeric_limits<int32>::max(),
+            ParseShardingFromDevice(*n, std::numeric_limits<int32_t>::max(),
                                     /*add_metadata=*/false));
         return sharding;
       };
@@ -173,7 +174,7 @@ absl::Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
   xla_context->Ref();
   absl::Status status;
   auto step_container = std::make_unique<ScopedStepContainer>(
-      step_id, [&status, device](const string& name) {
+      step_id, [&status, device](const std::string& name) {
        status = device->resource_manager()->Cleanup(name);
      });
   TF_RETURN_IF_ERROR(step_container->Create(device->resource_manager(),
@@ -484,8 +485,8 @@ absl::Status BuildComputation(
 
 }  // namespace
 
-string XlaCompiler::Argument::HumanString() const {
-  string common;
+std::string XlaCompiler::Argument::HumanString() const {
+  std::string common;
   if (!name.empty()) {
     common = absl::StrCat(" name=", name);
   }
@@ -503,7 +504,7 @@ string XlaCompiler::Argument::HumanString() const {
       return absl::StrCat("kind=constant-resource", common,
                           " value=", constant_value.DebugString());
     case kResource: {
-      string output = absl::StrCat(
+      std::string output = absl::StrCat(
           "kind=resource", common,
           " resource_kind=", XlaResource::KindToString(resource_kind),
           " initialized=", initialized, " is_fast_mem=", fast_mem);
@@ -543,7 +544,7 @@ XlaCompiler::Argument::DimensionSizesAsInlinedVector() const {
   }
 }
 
-string XlaCompiler::Argument::ShapeHumanString() const {
+std::string XlaCompiler::Argument::ShapeHumanString() const {
   if (absl::holds_alternative<TensorShape>(shape)) {
     return std::get<TensorShape>(shape).DebugString();
   } else {
@@ -592,9 +593,9 @@ XlaCompiler::~XlaCompiler() = default;
 
 int64_t XlaCompiler::NextStepId() { return next_step_id_++; }
 
-uint64 XlaCompiler::SignatureHash::operator()(
-    const std::pair<string, std::vector<Argument>>& signature) const {
-  return std::hash<string>()(signature.first);
+uint64_t XlaCompiler::SignatureHash::operator()(
+    const std::pair<std::string, std::vector<Argument>>& signature) const {
+  return std::hash<std::string>()(signature.first);
 }
 
 static absl::Status GetFunctionBody(const NameAttrList& function,
@@ -703,9 +704,9 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
                      flib_runtime_->GetFunctionLibraryDefinition(), &shape_info)
       .IgnoreError();
   auto node_name_index = graph->BuildNodeNameIndex();
-  std::unordered_map<string, std::vector<PartialTensorShape>> shape_map;
+  std::unordered_map<std::string, std::vector<PartialTensorShape>> shape_map;
   for (const auto& node_shape_info : shape_info) {
-    const string& node_name = node_shape_info.first;
+    const std::string& node_name = node_shape_info.first;
     const std::vector<PartialTensorShape>& output_shapes = node_shape_info.second;
     const auto& node_iter = node_name_index.find(node_name);
     if (node_iter != node_name_index.end()) {
@@ -726,9 +727,9 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
                      flib_runtime_->GetFunctionLibraryDefinition(), &shape_info)
       .IgnoreError();
   auto node_name_index = graph->BuildNodeNameIndex();
-  std::unordered_map<string, std::vector<PartialTensorShape>> shape_map;
+  std::unordered_map<std::string, std::vector<PartialTensorShape>> shape_map;
   for (const auto& node_shape_info : shape_info) {
-    const string& node_name = node_shape_info.first;
+    const std::string& node_name = node_shape_info.first;
     const std::vector<PartialTensorShape>& output_shapes = node_shape_info.second;
     const auto& node_iter = node_name_index.find(node_name);
     if (node_iter != node_name_index.end()) {
@@ -754,7 +755,7 @@ std::vector<Node*> GetValidControlRets(
   // the map with nodes in FunctionDef control_ret_nodes and later query it
   // using the nodes in `graph`. The Node pointers would be different but the
   // Node name is expected to remain the same between the two.
-  absl::flat_hash_map<string, int> control_ret_nodes_map;
+  absl::flat_hash_map<std::string, int> control_ret_nodes_map;
   for (int i = 0; i < orig_control_ret_nodes.size(); ++i) {
     const Node* n = orig_control_ret_nodes[i];
     control_ret_nodes_map[n->name()] = i;
@@ -814,7 +815,7 @@ absl::Status XlaCompiler::CompileFunction(
     const NameAttrList& fn_name_attrs,
     absl::Span<const XlaCompiler::Argument> args,
     XlaCompiler::CompilationResult* result) {
-  string function_id =
+  std::string function_id =
       Canonicalize(fn_name_attrs.name(), AttrSlice(&fn_name_attrs.attr()));
 
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
@@ -1325,7 +1326,7 @@ namespace {
 absl::Status ValidateFunctionDef(const FunctionDef* fdef,
                                  const FunctionLibraryDefinition& flib_def) {
   for (const NodeDef& node : fdef->node_def()) {
-    const string& op = node.op();
+    const std::string& op = node.op();
     if (op == FunctionLibraryDefinition::kGradientOp || flib_def.Find(op)) {
       continue;
     }
@@ -1340,7 +1341,8 @@ absl::Status ValidateFunctionDef(const FunctionDef* fdef,
 // Returned pointer points to the internal string either in node's attributes
 // or in its NodeDef. This pointer is valid as long as the node has not been
 // modified.
-absl::Status GetPotentialFunctionName(const Node& node, const string** name) {
+absl::Status GetPotentialFunctionName(const Node& node,
+                                      const std::string** name) {
   if (node.IsPartitionedCall()) {
     const AttrValue* attr_value;
     TF_RETURN_IF_ERROR(
@@ -1361,7 +1363,8 @@ absl::Status GetPotentialFunctionName(const Node& node, const string** name) {
 // given device_type, invalid data type, missing attributes...)
 absl::Status ValidateGraph(const Graph* graph,
                            const FunctionLibraryDefinition& flib_def,
-                           const DeviceType& device_type, const string& name) {
+                           const DeviceType& device_type,
+                           const std::string& name) {
   // Make sure the XLA compilation kernels are registered. This operation is
   // idempotent so it is fine if someone called it already.
   XlaOpRegistry::RegisterCompilationKernels();
@@ -1398,7 +1401,7 @@ absl::Status ValidateGraph(const Graph* graph,
     if (node->type_string() == FunctionLibraryDefinition::kGradientOp) {
       continue;
     }
-    const string* function_name;
+    const std::string* function_name;
     TF_RETURN_IF_ERROR(GetPotentialFunctionName(*node, &function_name));
     const FunctionDef* fdef = flib_def.Find(*function_name);
     absl::Status s;
@@ -1455,6 +1458,36 @@ class DummyStackTrace : public AbstractStackTrace {
 };
 
 namespace {
+const xla::HloInstructionProto* FindInstructionById(
+    const xla::HloComputationProto& computation, int64_t id) {
+  auto iter =
+      absl::c_find_if(computation.instructions(),
+                      [id](const xla::HloInstructionProto& instruction) {
+                        return instruction.id() == id;
+                      });
+  if (iter == computation.instructions().end()) {
+    return nullptr;
+  }
+  return &(*iter);
+}
+
+bool ShouldAddPrecisionToInstruction(
+    const xla::HloInstructionProto& instruction,
+    const xla::HloComputationProto& computation) {
+  static constexpr std::array<absl::string_view, 2> kOpsPossiblyUsingTF32 = {
+      "dot", "convolution"};
+  if (!absl::c_linear_search(kOpsPossiblyUsingTF32, instruction.opcode())) {
+    return false;
+  }
+  if (instruction.shape().element_type() == xla::F32) {
+    return true;
+  }
+  return absl::c_any_of(instruction.operand_ids(), [&](int64_t operand_id) {
+    const xla::HloInstructionProto* operand =
+        FindInstructionById(computation, operand_id);
+    return operand && operand->shape().element_type() == xla::F32;
+  });
+}
 
 // Add precisions configs to the HLO module to avoid TensorFloat32 computations
 // in XLA.
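The helper added above narrows the old blanket rule: rather than forcing HIGHEST precision on every dot and convolution, it asks whether F32 is involved at all, in the result or in any operand. The resulting decision table, sketched with hypothetical shapes:

    // dot(f32, f32)   -> f32  : HIGHEST  (TF32 would silently drop mantissa bits)
    // dot(bf16, bf16) -> bf16 : DEFAULT  (TF32 does not apply to bf16 math)
    // dot(bf16, f32)  -> bf16 : HIGHEST  (one operand is still F32)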
@@ -1462,13 +1495,7 @@ namespace {
 // Some operations, such as Einsum are converted through MlirXlaOpKernel, which
 // doesn't set the precisions, so we set them all here.
 //
-// TODO(tdanyluk): We may want to restrict this logic to only set the operand
-// precision for F32 operands. (Historically, it was set without regard to
-// operand type in other parts of TF2XLA.)
 void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) {
-  static constexpr std::array<absl::string_view, 2> kOpsPossiblyUsingTF32 = {
-      "dot", "convolution"};
-
   xla::PrecisionConfig precision_config;
   precision_config.add_operand_precision(xla::PrecisionConfig::HIGHEST);
   precision_config.add_operand_precision(xla::PrecisionConfig::HIGHEST);
@@ -1476,8 +1503,7 @@ void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) {
   for (xla::HloComputationProto& computation : *module.mutable_computations()) {
     for (xla::HloInstructionProto& instruction :
          *computation.mutable_instructions()) {
-      if (absl::c_find(kOpsPossiblyUsingTF32, instruction.opcode()) !=
-          kOpsPossiblyUsingTF32.end()) {
+      if (ShouldAddPrecisionToInstruction(instruction, computation)) {
         *instruction.mutable_precision_config() = precision_config;
       }
     }
@@ -1487,7 +1513,7 @@ void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) {
 }  // namespace
 
 absl::Status XlaCompiler::CompileGraph(
-    const XlaCompiler::CompileOptions& options, string const& name,
+    const XlaCompiler::CompileOptions& options, const std::string& name,
     std::unique_ptr<Graph> graph, absl::Span<const XlaCompiler::Argument> args,
     CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate XlaBuilder.: " << name;
@@ -1689,7 +1715,7 @@ xla::ChannelHandle XlaCompiler::NewChannel(
   return new_handle;
 }
 
-absl::Status XlaCompiler::GetChannelHandle(const string& key,
+absl::Status XlaCompiler::GetChannelHandle(const std::string& key,
                                            xla::ChannelHandle* channel) {
   auto result = channels_.emplace(key, xla::ChannelHandle());
   if (result.second) {
@@ -1701,7 +1727,7 @@ absl::Status XlaCompiler::GetChannelHandle(const string& key,
 }
 
 absl::Status XlaCompiler::GetHostToDeviceChannelHandle(
-    const string& key, xla::ChannelHandle* channel) {
+    const std::string& key, xla::ChannelHandle* channel) {
   auto result = channels_.emplace(key, xla::ChannelHandle());
   if (result.second) {
     result.first->second = NewChannel(xla::ChannelHandle::HOST_TO_DEVICE);
@@ -1712,7 +1738,7 @@ absl::Status XlaCompiler::GetHostToDeviceChannelHandle(
 }
 
 absl::Status XlaCompiler::GetDeviceToHostChannelHandle(
-    const string& key, xla::ChannelHandle* channel) {
+    const std::string& key, xla::ChannelHandle* channel) {
   auto result = channels_.emplace(key, xla::ChannelHandle());
   if (result.second) {
     result.first->second = NewChannel(xla::ChannelHandle::DEVICE_TO_HOST);
@@ -1724,7 +1750,7 @@ absl::Status XlaCompiler::GetDeviceToHostChannelHandle(
 
 namespace {
 
-void SetTransfer(const string& key, absl::Span<DataType> types,
+void SetTransfer(const std::string& key, absl::Span<DataType> types,
                  absl::Span<const TensorShape> shapes,
                  tf2xla::HostTransferMetadata* transfer) {
   transfer->set_key(key);
@@ -1739,7 +1765,7 @@ void SetTransfer(const string& key, absl::Span<DataType> types,
 }  // namespace
 
 absl::Status XlaCompiler::SetDeviceToHostMetadata(
-    const string& key, absl::Span<DataType> types,
+    const std::string& key, absl::Span<DataType> types,
     absl::Span<const TensorShape> shapes) {
   if (host_compute_sends_.find(key) != host_compute_sends_.end()) {
     tf2xla::HostTransferMetadata& existing_transfer = host_compute_sends_[key];
@@ -1759,7 +1785,7 @@ absl::Status XlaCompiler::SetDeviceToHostMetadata(
 }
 
 absl::Status XlaCompiler::GetDeviceToHostShapes(
-    const string& key, std::vector<TensorShape>* shapes) const {
+    const std::string& key, std::vector<TensorShape>* shapes) const {
   const auto iter = host_compute_sends_.find(key);
   if (iter == host_compute_sends_.end()) {
     return errors::InvalidArgument(
@@ -1774,7 +1800,7 @@ absl::Status XlaCompiler::GetDeviceToHostShapes(
 }
 
 absl::Status XlaCompiler::SetHostToDeviceMetadata(
-    const string& key, absl::Span<DataType> types,
+    const std::string& key, absl::Span<DataType> types,
     absl::Span<const TensorShape> shapes) {
   if (host_compute_recvs_.find(key) != host_compute_recvs_.end()) {
     tf2xla::HostTransferMetadata& existing_transfer = host_compute_recvs_[key];
@@ -1794,7 +1820,7 @@ absl::Status XlaCompiler::SetHostToDeviceMetadata(
 }
 
 absl::Status XlaCompiler::GetHostComputeControlDependency(
-    const string& host_compute_name, xla::XlaOp* handle) {
+    const std::string& host_compute_name, xla::XlaOp* handle) {
   const auto iter = host_compute_control_output_.find(host_compute_name);
   if (iter == host_compute_control_output_.end()) {
     return errors::InvalidArgument(
@@ -1807,7 +1833,7 @@ absl::Status XlaCompiler::GetHostComputeControlDependency(
 }
 
 absl::Status XlaCompiler::SetHostComputeControlDependency(
-    const string& host_compute_name, const xla::XlaOp handle) {
+    const std::string& host_compute_name, const xla::XlaOp handle) {
   if (host_compute_control_output_.find(host_compute_name) !=
       host_compute_control_output_.end()) {
     return errors::InvalidArgument(
@@ -1819,7 +1845,7 @@ absl::Status XlaCompiler::SetHostComputeControlDependency(
 }
 
 void XlaCompiler::PushNodeTokenMapping() {
-  node_token_mapping_stack_.emplace(std::map<string, xla::XlaOp>{});
+  node_token_mapping_stack_.emplace(std::map<std::string, xla::XlaOp>{});
 }
 
 absl::Status XlaCompiler::PopNodeTokenMapping() {
@@ -1832,7 +1858,7 @@ absl::Status XlaCompiler::PopNodeTokenMapping() {
   return absl::OkStatus();
 }
 
-absl::Status XlaCompiler::SetNodeToken(const string& node_name,
+absl::Status XlaCompiler::SetNodeToken(const std::string& node_name,
                                        const xla::XlaOp op) {
   if (node_token_mapping_stack_.empty()) {
     return errors::FailedPrecondition(
@@ -1847,7 +1873,8 @@ absl::Status XlaCompiler::SetNodeToken(const string& node_name,
   return absl::OkStatus();
 }
 
-absl::StatusOr<xla::XlaOp> XlaCompiler::GetNodeToken(const string& node_name) {
+absl::StatusOr<xla::XlaOp> XlaCompiler::GetNodeToken(
+    const std::string& node_name) {
   if (node_token_mapping_stack_.empty()) {
     return errors::FailedPrecondition(
         "Calling GetNodeToken() when node_token_mapping_stack_ is "
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 2beb730eb06fa3..216125f9cb153e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -277,7 +277,8 @@ class XlaCompiler {
 
   // Compiles a tensorflow::Graph into an xla::XlaComputation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
-  absl::Status CompileGraph(const CompileOptions& options, string const& name,
+  absl::Status CompileGraph(const CompileOptions& options,
+                            const std::string& name,
                             std::unique_ptr<Graph> graph,
                             absl::Span<const Argument> args,
                             CompilationResult* result);
@@ -295,31 +296,32 @@ class XlaCompiler {
   // Channel handles can be used to communicate between different
   // computations. Computations that communicate should be compiled with the
   // same XlaCompiler.
-  absl::Status GetChannelHandle(const string& key, xla::ChannelHandle* channel);
+  absl::Status GetChannelHandle(const std::string& key,
+                                xla::ChannelHandle* channel);
 
   // Retrieves the host-to-device channel handle associated with `key`.
   // Allocates a new channel handle if none exists.
-  absl::Status GetHostToDeviceChannelHandle(const string& key,
+  absl::Status GetHostToDeviceChannelHandle(const std::string& key,
                                             xla::ChannelHandle* channel);
 
   // Retrieves the device-to-host channel handle associated with `key`.
   // Allocates a new channel handle if none exists.
-  absl::Status GetDeviceToHostChannelHandle(const string& key,
+  absl::Status GetDeviceToHostChannelHandle(const std::string& key,
                                             xla::ChannelHandle* channel);
 
   // Sets the shapes and types for the device to host transfer associated with
   // 'key'.
-  absl::Status SetDeviceToHostMetadata(const string& key,
+  absl::Status SetDeviceToHostMetadata(const std::string& key,
                                        absl::Span<DataType> types,
                                        absl::Span<const TensorShape> shapes);
 
   // Gets the shapes the device to host transfer associated with 'key'.
-  absl::Status GetDeviceToHostShapes(const string& key,
+  absl::Status GetDeviceToHostShapes(const std::string& key,
                                      std::vector<TensorShape>* shapes) const;
 
   // Sets the shapes and types for the host to device transfer associated with
   // 'key'.
-  absl::Status SetHostToDeviceMetadata(const string& key,
+  absl::Status SetHostToDeviceMetadata(const std::string& key,
                                        absl::Span<DataType> types,
                                        absl::Span<const TensorShape> shapes);
 
@@ -334,10 +336,10 @@ class XlaCompiler {
   // 'host_compute_name' can be any string the client wishes to use to identify
   // a given HostCompute Op as long as the names are unique within the
   // compilation.
-  absl::Status GetHostComputeControlDependency(const string& host_compute_name,
-                                               xla::XlaOp* handle);
-  absl::Status SetHostComputeControlDependency(const string& host_compute_name,
-                                               xla::XlaOp handle);
+  absl::Status GetHostComputeControlDependency(
+      const std::string& host_compute_name, xla::XlaOp* handle);
+  absl::Status SetHostComputeControlDependency(
+      const std::string& host_compute_name, xla::XlaOp handle);
 
   const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
@@ -345,8 +347,8 @@ class XlaCompiler {
 
   void PushNodeTokenMapping();
   absl::Status PopNodeTokenMapping();
-  absl::Status SetNodeToken(const string& node_name, xla::XlaOp op);
-  absl::StatusOr<xla::XlaOp> GetNodeToken(const string& node_name);
+  absl::Status SetNodeToken(const std::string& node_name, xla::XlaOp op);
+  absl::StatusOr<xla::XlaOp> GetNodeToken(const std::string& node_name);
 
   // Sets the function body `fbody` to the one registered as `function`.
   absl::Status FindFunctionBody(const NameAttrList& function,
@@ -405,20 +407,22 @@ class XlaCompiler {
   FunctionLibraryRuntime* flib_runtime_;  // owned by pflr_.
 
   struct SignatureHash {
-    uint64 operator()(
-        const std::pair<string, std::vector<Argument>>& signature) const;
+    uint64_t operator()(
+        const std::pair<std::string, std::vector<Argument>>& signature) const;
   };
 
-  std::unordered_map<std::pair<string, std::vector<Argument>>,
+  std::unordered_map<std::pair<std::string, std::vector<Argument>>,
                      CompilationResult, SignatureHash>
       cache_;
 
-  std::unordered_map<string, xla::ChannelHandle> channels_;
+  std::unordered_map<std::string, xla::ChannelHandle> channels_;
 
-  std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_sends_;
-  std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_recvs_;
+  std::unordered_map<std::string, tf2xla::HostTransferMetadata>
+      host_compute_sends_;
+  std::unordered_map<std::string, tf2xla::HostTransferMetadata>
+      host_compute_recvs_;
 
-  std::unordered_map<string, xla::XlaOp> host_compute_control_output_;
+  std::unordered_map<std::string, xla::XlaOp> host_compute_control_output_;
 
   // This is used to store <node name, token output> mapping. Side-effecting
   // ops call SetNodeToken() to record its token output, so later side-effecting
@@ -427,7 +431,7 @@ class XlaCompiler {
   // It's a stack because we need a mapping like this for each level of nested
   // CompileGraph() call. In CompileGraph(), we will push a new mapping to the
   // stack, and pop the mapping before returning.
-  std::stack<std::map<string, xla::XlaOp>> node_token_mapping_stack_;
+  std::stack<std::map<std::string, xla::XlaOp>> node_token_mapping_stack_;
 
   XlaCompiler(const XlaCompiler&) = delete;
   void operator=(const XlaCompiler&) = delete;
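A note on SignatureHash in the header diff above: it hashes only the function-name half of the key. That is still a valid hash for an unordered_map, since the container compares complete keys on collision; signatures sharing a name merely share a bucket. A minimal sketch of the idea with a simplified key type:

    #include <functional>
    #include <string>
    #include <unordered_map>
    #include <utility>

    struct NameOnlyHash {
      size_t operator()(const std::pair<std::string, int>& key) const {
        return std::hash<std::string>()(key.first);  // coarse but correct
      }
    };

    // Keys that share a name collide harmlessly; std::equal_to on the full
    // pair keeps distinct (name, args) entries separate.
    std::unordered_map<std::pair<std::string, int>, int, NameOnlyHash> cache;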
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 7d8a4f2c431e80..a29094470b911f 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -140,7 +140,7 @@ namespace {
 // compiled kernels.
 class DummyResourceForTest : public ResourceBase {
  public:
-  string DebugString() const override { return "dummy"; }
+  std::string DebugString() const override { return "dummy"; }
   void Increment() { ++value_; }
   int Get() { return value_; }
@@ -268,8 +268,8 @@ TEST_F(XlaCompilerTest, Simple) {
                                      std::move(graph), args, &result));
 
   // Tests that the generated computation works.
-  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32>({7, 42});
-  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32>({-3, 101});
+  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32_t>({7, 42});
+  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32_t>({-3, 101});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(param0_literal).value();
   std::unique_ptr<xla::GlobalData> param1_data =
@@ -281,7 +281,7 @@ TEST_F(XlaCompilerTest, Simple) {
           .value();
   xla::Literal actual_literal = client_->Transfer(*actual).value();
 
-  xla::Literal expected0 = xla::LiteralUtil::CreateR1<int32>({4, 143});
+  xla::Literal expected0 = xla::LiteralUtil::CreateR1<int32_t>({4, 143});
   xla::Literal expected_literal = xla::LiteralUtil::MakeTuple({&expected0});
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal));
 }
@@ -366,8 +366,8 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) {
                                      args, &result));
 
   // Tests that the generated computation works.
-  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32>({7, 42});
-  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32>({-3, 101});
+  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32_t>({7, 42});
+  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32_t>({-3, 101});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(param0_literal).value();
   std::unique_ptr<xla::GlobalData> param1_data =
@@ -484,7 +484,7 @@ TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) {
   auto read = ops::ReadVariableOp(
       scope.WithControlDependencies(std::vector<Operation>{write}), var,
       DT_INT32);
-  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32_t>(scope, 1));
   auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_ASSERT_OK(scope.ToGraph(graph.get()));
@@ -602,7 +602,7 @@ TEST_F(XlaCompilerTest, MixedOrderArguments) {
   auto read = ops::ReadVariableOp(
       scope.WithControlDependencies(std::vector<Operation>{write}), var,
       DT_INT32);
-  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32_t>(scope, 1));
   auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_ASSERT_OK(scope.ToGraph(graph.get()));
@@ -680,7 +680,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   // func(a) { b=7; c=-a; return b, c; }
   Scope scope = Scope::NewRootScope().ExitOnError();
   auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
-  auto b = ops::Const<int32>(scope.WithOpName("B"), 7);
+  auto b = ops::Const<int32_t>(scope.WithOpName("B"), 7);
   auto c = ops::Neg(scope.WithOpName("C"), a);
   auto d = ops::_Retval(scope.WithOpName("D"), b, 0);
   auto e = ops::_Retval(scope.WithOpName("E"), c, 1);
@@ -710,7 +710,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   EXPECT_FALSE(result.outputs[1].is_constant);
 
   // Tests that the generated computation works.
-  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32>({7, 42});
+  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32_t>({7, 42});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(param0_literal).value();
 
@@ -718,8 +718,8 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
       client_->Execute(*result.computation, {param0_data.get()}).value();
   xla::Literal actual_literal = client_->Transfer(*actual).value();
 
-  xla::Literal expected0 = xla::LiteralUtil::CreateR0<int32>(7);
-  xla::Literal expected1 = xla::LiteralUtil::CreateR1<int32>({-7, -42});
+  xla::Literal expected0 = xla::LiteralUtil::CreateR0<int32_t>(7);
+  xla::Literal expected1 = xla::LiteralUtil::CreateR1<int32_t>({-7, -42});
   xla::Literal expected = xla::LiteralUtil::MakeTuple({&expected0, &expected1});
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected, actual_literal));
@@ -885,7 +885,7 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) {
       // The names of instructions were uniquified by the XlaBuilder and the
       // unique ids may be different, the rest of the fields should be
       // identical.
-      string str1, str2;
+      std::string str1, str2;
       LOG(INFO) << "instr1 = " << instr1.DebugString();
       LOG(INFO) << "instr2 = " << instr2.DebugString();
       instr1.AppendPartialToString(&str1);
@@ -904,7 +904,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
   auto flow = ops::Const<float>(scope, {});
   auto grad1 = ops::TensorArrayGrad(scope, arg, flow, "grad1");
   auto grad2 = ops::TensorArrayGrad(scope, arg, grad1.flow_out, "grad2");
-  auto index = ops::Const<int32>(scope, 1);
+  auto index = ops::Const<int32_t>(scope, 1);
   auto write = ops::TensorArrayWrite(scope, grad1.grad_handle, index, index,
                                      grad2.flow_out);
   auto read = ops::TensorArrayRead(scope, arg, index, write.flow_out, DT_INT32);
@@ -933,12 +933,12 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
   const XlaCompiler::ResourceUpdate& update = result.resource_updates[0];
   EXPECT_EQ(0, update.input_index);
   EXPECT_EQ(DT_INT32, update.type);
-  EXPECT_EQ((std::set<string>{"grad1", "grad2"}),
+  EXPECT_EQ((std::set<std::string>{"grad1", "grad2"}),
             update.tensor_array_gradients_accessed);
 
   // Tests that the generated computation works.
-  xla::Literal input_base = xla::LiteralUtil::CreateR1<int32>({7, 42});
-  xla::Literal input_grad2 = xla::LiteralUtil::CreateR1<int32>({-3, 101});
+  xla::Literal input_base = xla::LiteralUtil::CreateR1<int32_t>({7, 42});
+  xla::Literal input_grad2 = xla::LiteralUtil::CreateR1<int32_t>({-3, 101});
   xla::Literal input = xla::LiteralUtil::MakeTuple({&input_base, &input_grad2});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(input).value();
@@ -947,10 +947,10 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
       client_->Execute(*result.computation, {param0_data.get()}).value();
   xla::Literal actual_literal = client_->Transfer(*actual).value();
 
-  xla::Literal output_read = xla::LiteralUtil::CreateR0<int32>(42);
-  xla::Literal output_base = xla::LiteralUtil::CreateR1<int32>({7, 42});
-  xla::Literal output_grad1 = xla::LiteralUtil::CreateR1<int32>({0, 1});
-  xla::Literal output_grad2 = xla::LiteralUtil::CreateR1<int32>({-3, 101});
+  xla::Literal output_read = xla::LiteralUtil::CreateR0<int32_t>(42);
+  xla::Literal output_base = xla::LiteralUtil::CreateR1<int32_t>({7, 42});
+  xla::Literal output_grad1 = xla::LiteralUtil::CreateR1<int32_t>({0, 1});
+  xla::Literal output_grad2 = xla::LiteralUtil::CreateR1<int32_t>({-3, 101});
   xla::Literal output_resource =
       xla::LiteralUtil::MakeTuple({&output_base, &output_grad1, &output_grad2});
   xla::Literal expected_literal =
@@ -964,7 +964,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) {
   auto arg = ops::_Arg(scope.WithOpName("arg"), DT_RESOURCE, 0);
   auto flow = ops::Const<float>(scope, {});
   auto grad1 = ops::TensorArrayGrad(scope, arg, flow, "grad1");
-  auto index = ops::Const<int32>(scope, 1);
+  auto index = ops::Const<int32_t>(scope, 1);
   auto read = ops::TensorArrayRead(scope, arg, index, grad1.flow_out, DT_INT32);
   auto retval = ops::_Retval(scope.WithOpName("retval"), read, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
@@ -996,7 +996,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) {
   auto arg = ops::_Arg(scope.WithOpName("arg"), DT_RESOURCE, 0);
   auto flow = ops::Const<float>(scope, {});
   auto grad1 = ops::TensorArrayGrad(scope, arg, flow, "grad2");
-  auto index = ops::Const<int32>(scope, 1);
+  auto index = ops::Const<int32_t>(scope, 1);
   auto read = ops::TensorArrayRead(scope, arg, index, grad1.flow_out, DT_INT32);
   auto retval = ops::_Retval(scope.WithOpName("retval"), read, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
@@ -1067,8 +1067,8 @@ TEST_F(XlaCompilerTest, FunctionCallWithConstants) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   Scope scope = Scope::NewRootScope().ExitOnError();
-  auto value = ops::Const<int32>(scope.WithOpName("value"), 1, {});
-  auto shape = ops::Const<int32>(scope.WithOpName("shape"), {5}, {1});
+  auto value = ops::Const<int32_t>(scope.WithOpName("value"), 1, {});
+  auto shape = ops::Const<int32_t>(scope.WithOpName("shape"), {5}, {1});
   TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(flib));
 
   NodeDef def;
@@ -1151,9 +1151,9 @@ TEST_F(XlaCompilerTest, SliceWithDynamicBegins) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   Scope scope = Scope::NewRootScope().ExitOnError();
-  auto value = ops::Const<int32>(scope.WithOpName("shape"), {5}, {1});
+  auto value = ops::Const<int32_t>(scope.WithOpName("shape"), {5}, {1});
   auto begin = ops::_Arg(scope.WithOpName("arg"), DT_INT32, 0);
-  auto size = ops::Const<int32>(scope.WithOpName("value"), {1}, {1});
+  auto size = ops::Const<int32_t>(scope.WithOpName("value"), {1}, {1});
 
   TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(flib));
 
@@ -1188,8 +1188,8 @@ TEST_F(XlaCompilerTest, SliceWithDynamicBegins) {
 
 void RunAndCheckVariablesComputation(
     xla::Client* client, const XlaCompiler::CompilationResult& result) {
-  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32>({7, 42});
-  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32>({-3, 101});
+  xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32_t>({7, 42});
+  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32_t>({-3, 101});
   std::unique_ptr<xla::GlobalData> param0_data =
       client->TransferToServer(param0_literal).value();
   std::unique_ptr<xla::GlobalData> param1_data =
@@ -1201,8 +1201,8 @@ void RunAndCheckVariablesComputation(
           .value();
   xla::Literal actual_literal = client->Transfer(*actual).value();
 
-  xla::Literal expected0 = xla::LiteralUtil::CreateR1<int32>({5, 144});
-  xla::Literal expected1 = xla::LiteralUtil::CreateR1<int32>({4, 143});
+  xla::Literal expected0 = xla::LiteralUtil::CreateR1<int32_t>({5, 144});
+  xla::Literal expected1 = xla::LiteralUtil::CreateR1<int32_t>({4, 143});
   xla::Literal expected_literal =
       xla::LiteralUtil::MakeTuple({&expected0, &expected1});
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal));
@@ -1220,7 +1220,7 @@ TEST_F(XlaCompilerTest, Variables) {
   auto read = ops::ReadVariableOp(
       scope.WithControlDependencies(std::vector<Operation>{write}), var,
       DT_INT32);
-  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32_t>(scope, 1));
   auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_ASSERT_OK(scope.ToGraph(graph.get()));
@@ -1356,7 +1356,7 @@ TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) {
                                      std::move(graph), args, &result));
 
   // Tests that the generated computation works.
-  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32>({-3, 101});
+  xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32_t>({-3, 101});
   std::unique_ptr<xla::GlobalData> param1_data =
       client_->TransferToServer(param1_literal).value();
@@ -1379,7 +1379,7 @@ TEST_F(XlaCompilerTest, ReturnResourceHandle) {
   auto read = ops::ReadVariableOp(
       scope.WithControlDependencies(std::vector<Operation>{write}), var,
       DT_INT32);
-  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32_t>(scope, 1));
   auto r = ops::_Retval(scope.WithOpName("R"), var, 0);
   auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 1);
@@ -1414,7 +1414,7 @@ absl::StatusOr<std::unique_ptr<Graph>> BuildTestGraph() {
   auto read = ops::ReadVariableOp(
       scope.WithControlDependencies(std::vector<Operation>{write}), var,
       DT_INT32);
-  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32_t>(scope, 1));
   auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   TF_RETURN_IF_ERROR(scope.ToGraph(graph.get()));
@@ -1475,9 +1475,9 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
   // Tests that the generated computation works.
   xla::Literal param0_literal =
-      xla::LiteralUtil::CreateR2<int32>({{4, 55}, {1, -3}});
+      xla::LiteralUtil::CreateR2<int32_t>({{4, 55}, {1, -3}});
   xla::Literal param1_literal =
-      xla::LiteralUtil::CreateR1<int32>({22, 11, 33, 404});
+      xla::LiteralUtil::CreateR1<int32_t>({22, 11, 33, 404});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(param0_literal).value();
   std::unique_ptr<xla::GlobalData> param1_data =
@@ -1490,8 +1490,9 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
   xla::Literal actual_literal = client_->Transfer(*actual).value();

   xla::Literal expected0 =
-      xla::LiteralUtil::CreateR2<int32>({{27, 67}, {35, 402}});
-  xla::Literal expected1 = xla::LiteralUtil::CreateR1<int32>({26, 66, 34, 401});
+      xla::LiteralUtil::CreateR2<int32_t>({{27, 67}, {35, 402}});
+  xla::Literal expected1 =
+      xla::LiteralUtil::CreateR1<int32_t>({26, 66, 34, 401});
   xla::Literal expected_literal =
       xla::LiteralUtil::MakeTuple({&expected0, &expected1});
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal));
@@ -1547,9 +1548,9 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
   // Tests that the generated computation works.
   xla::Literal param0_literal =
-      xla::LiteralUtil::CreateR1<int32>({4, 55, 1, -3});
+      xla::LiteralUtil::CreateR1<int32_t>({4, 55, 1, -3});
   xla::Literal param1_literal =
-      xla::LiteralUtil::CreateR1<int32>({22, 11, 33, 404});
+      xla::LiteralUtil::CreateR1<int32_t>({22, 11, 33, 404});
   std::unique_ptr<xla::GlobalData> param0_data =
       client_->TransferToServer(param0_literal).value();
   std::unique_ptr<xla::GlobalData> param1_data =
@@ -1561,8 +1562,10 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
           .value();
   xla::Literal actual_literal = client_->Transfer(*actual).value();

-  xla::Literal expected0 = xla::LiteralUtil::CreateR1<int32>({27, 67, 35, 402});
-  xla::Literal expected1 = xla::LiteralUtil::CreateR1<int32>({26, 66, 34, 401});
+  xla::Literal expected0 =
+      xla::LiteralUtil::CreateR1<int32_t>({27, 67, 35, 402});
+  xla::Literal expected1 =
+      xla::LiteralUtil::CreateR1<int32_t>({26, 66, 34, 401});
   xla::Literal expected_literal =
       xla::LiteralUtil::MakeTuple({&expected0, &expected1});
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal));
@@ -1587,8 +1590,8 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   Scope scope = Scope::NewRootScope().ExitOnError();
-  auto value = ops::Const<int32>(scope.WithOpName("value"), 1, {});
-  auto shape = ops::Const<int32>(scope.WithOpName("shape"), {5}, {1});
+  auto value = ops::Const<int32_t>(scope.WithOpName("value"), 1, {});
+  auto shape = ops::Const<int32_t>(scope.WithOpName("shape"), {5}, {1});
   TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(flib));

   NodeDef def;
@@ -1684,7 +1687,8 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
   side_effecting_op.set_name("DummySideEffectingOp");
   side_effecting_op.set_op("DummySideEffectingOp");
   AddNodeAttr(kXlaTokenInputNodesAttrName,
-              std::vector<string>{kXlaTokenArgNodeName}, &side_effecting_op);
+              std::vector<std::string>{kXlaTokenArgNodeName},
+              &side_effecting_op);
   AddNodeAttr(kXlaOriginalOutsideCompilationNodeName, side_effecting_op.name(),
               &side_effecting_op);
   absl::Status status;
@@ -1768,8 +1772,8 @@ TEST_F(XlaCompilerTest, OpsWithTensorListInput) {
   }
   Scope scope = Scope::NewRootScope().ExitOnError();
-  auto element_shape = ops::Const<int32>(scope, {1}, {1});
-  auto max_elements = ops::Const<int32>(scope, {10}, {});
+  auto element_shape = ops::Const<int32_t>(scope, {1}, {1});
+  auto max_elements = ops::Const<int32_t>(scope, {10}, {});
   auto arg = ops::_Arg(scope.WithOpName("arg"), DT_VARIANT, 0);
   std::initializer_list<Input> out = {arg, arg};
   auto add_n = ops::AddN(scope, out);
@@ -1822,7 +1826,7 @@ TEST_F(XlaCompilerTest, WhileWithResources) {
   auto arg0 = ops::_Arg(scope.WithOpName("arg0"), DT_INT32, 0);
   auto arg1 = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1);
   auto arg2 = ops::_Arg(scope.WithOpName("arg2"), DT_RESOURCE, 2);
-  auto less = ops::Less(scope, arg0, ops::Const<int32>(scope, 10));
+  auto less = ops::Less(scope, arg0, ops::Const<int32_t>(scope, 10));
   (void)ops::_Retval(scope.WithOpName("ret"), less, 0);
   TF_ASSERT_OK(scope.ToGraph(graph.get()));
   FunctionDef fdef;
@@ -1899,9 +1903,9 @@ TEST_F(XlaCompilerTest, WhileWithResources) {
   ASSERT_EQ(output2.input_index, 2);

   // Tests that the generated computation works.
-  xla::Literal literal0 = xla::LiteralUtil::CreateR0<int32>(0);
-  xla::Literal literal1 = xla::LiteralUtil::CreateR0<int32>(2);
-  xla::Literal literal2 = xla::LiteralUtil::CreateR0<int32>(1);
+  xla::Literal literal0 = xla::LiteralUtil::CreateR0<int32_t>(0);
+  xla::Literal literal1 = xla::LiteralUtil::CreateR0<int32_t>(2);
+  xla::Literal literal2 = xla::LiteralUtil::CreateR0<int32_t>(1);
   std::unique_ptr<xla::GlobalData> data0 =
       client_->TransferToServer(literal0).value();
   std::unique_ptr<xla::GlobalData> data1 =
@@ -1916,9 +1920,9 @@ TEST_F(XlaCompilerTest, WhileWithResources) {
           .value();
   xla::Literal actual_literal = client_->Transfer(*actual).value();

-  xla::Literal expected0 = xla::LiteralUtil::CreateR0<int32>(10);
-  xla::Literal expected1 = xla::LiteralUtil::CreateR0<int32>(2);
-  xla::Literal expected2 = xla::LiteralUtil::CreateR0<int32>(1);
+  xla::Literal expected0 = xla::LiteralUtil::CreateR0<int32_t>(10);
+  xla::Literal expected1 = xla::LiteralUtil::CreateR0<int32_t>(2);
+  xla::Literal expected2 = xla::LiteralUtil::CreateR0<int32_t>(1);
   xla::Literal expected_literal =
       xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2});
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(expected_literal, actual_literal));
@@ -1978,7 +1982,7 @@ TEST_F(XlaCompilerTest, SetShardingForReturnedTuple) {

 TEST_F(XlaCompilerTest, AliasResourceUpdates) {
   Scope scope = Scope::NewRootScope().ExitOnError();
-  auto a = ops::Const<int32>(scope.WithOpName("A"), {1, 2});
+  auto a = ops::Const<int32_t>(scope.WithOpName("A"), {1, 2});
   auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
   auto write = ops::AssignAddVariableOp(scope, var, a);
   auto read = ops::ReadVariableOp(
@@ -2022,7 +2026,7 @@ TEST_F(XlaCompilerTest, AliasResourceUpdates) {
 TEST_F(XlaCompilerTest, SetDeviceToHostMetadataExactDuplicate) {
   XlaCompiler compiler(DefaultOptions());

-  const string& key = "comm_key";
+  const std::string& key = "comm_key";
   std::vector<DataType> types{DT_INT32};
   std::vector<TensorShape> shapes{TensorShape({2})};
@@ -2035,7 +2039,7 @@ TEST_F(XlaCompilerTest, SetDeviceToHostMetadataExactDuplicate) {
 TEST_F(XlaCompilerTest, SetDeviceToHostMetadataMismatchedDuplicate) {
   XlaCompiler compiler(DefaultOptions());

-  const string& key = "comm_key";
+  const std::string& key = "comm_key";
   std::vector<DataType> types{DT_INT32};
   std::vector<TensorShape> shapes{TensorShape({2})};
   std::vector<DataType> types2{DT_FLOAT};
@@ -2051,7 +2055,7 @@ TEST_F(XlaCompilerTest, SetDeviceToHostMetadataMismatchedDuplicate) {
 TEST_F(XlaCompilerTest, SetHostToDeviceMetadataExactDuplicate) {
   XlaCompiler compiler(DefaultOptions());

-  const string& key = "comm_key";
+  const std::string& key = "comm_key";
   std::vector<DataType> types{DT_INT32};
   std::vector<TensorShape> shapes{TensorShape({2})};
@@ -2064,7 +2068,7 @@ TEST_F(XlaCompilerTest, SetHostToDeviceMetadataExactDuplicate) {
 TEST_F(XlaCompilerTest, SetHostToDeviceMetadataMismatchedDuplicate) {
   XlaCompiler compiler(DefaultOptions());

-  const string& key = "comm_key";
+  const std::string& key = "comm_key";
   std::vector<DataType> types{DT_INT32};
   std::vector<TensorShape> shapes{TensorShape({2})};
   std::vector<DataType> types2{DT_FLOAT};
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 92ddf0125aded1..fad607b1ae1333 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -67,7 +67,7 @@ XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder,
   }
 }

-string XlaContext::DebugString() const { return "XLA JIT context"; }
+std::string XlaContext::DebugString() const { return "XLA JIT context"; }

 void XlaContext::SetRetval(int index, const XlaExpression& expression) {
   const int64_t retvals_size = retvals_.size();
@@ -84,7 +84,7 @@ XlaResource* XlaContext::AddResource(std::unique_ptr<XlaResource> resource) {

 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
   return LookupOrCreate(type, &max_func_, [type] {
-    const string type_string = DataTypeString(type);
+    const std::string type_string = DataTypeString(type);
     VLOG(1) << "Building Max() for " << type_string;
     xla::XlaBuilder b("max<" + type_string + ">");
     xla::PrimitiveType xla_type;
@@ -100,7 +100,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {

 const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
   return LookupOrCreate(type, &min_func_, [type] {
-    const string type_string = DataTypeString(type);
+    const std::string type_string = DataTypeString(type);
     VLOG(1) << "Building Min() for " << type_string;
     xla::XlaBuilder b("min<" + type_string + ">");
     xla::PrimitiveType xla_type;
@@ -116,7 +116,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {

 const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
   return LookupOrCreate(type, &add_func_, [type] {
-    const string type_string = DataTypeString(type);
+    const std::string type_string = DataTypeString(type);
     VLOG(1) << "Building Add() for " << type_string;
     xla::XlaBuilder b("add<" + type_string + ">");
     xla::PrimitiveType xla_type;
@@ -133,7 +133,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
 const xla::XlaComputation* XlaContext::GetOrCreateLogAddExp(
     const DataType type) {
   return LookupOrCreate(type, &log_add_exp_func_, [type] {
-    const string type_string = DataTypeString(type);
+    const std::string type_string = DataTypeString(type);
     VLOG(1) << "Building LogAddExp() for " << type_string;
     xla::XlaBuilder b("log_add_exp<" + type_string + ">");
     xla::PrimitiveType xla_type;
@@ -154,7 +154,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateLogAddExp(

 const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) {
   return LookupOrCreate(type, &mul_func_, [type] {
-    const string type_string = DataTypeString(type);
+    const std::string type_string = DataTypeString(type);
     VLOG(1) << "Building Mul() for " << type_string;
     xla::XlaBuilder b("mul<" + type_string + ">");
     xla::PrimitiveType xla_type;
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 9184fb4300633c..1d72f0c756f364 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -50,7 +50,7 @@ class XlaContext : public ResourceBase {
              const Graph* graph);

   // Virtual method defined by ResourceBase.
-  string DebugString() const override;
+  std::string DebugString() const override;

   XlaCompiler* compiler() const { return compiler_; }
diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc
index 61bd10e413ccf3..e867dd14209ab8 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.cc
+++ b/tensorflow/compiler/tf2xla/xla_expression.cc
@@ -73,7 +73,7 @@ XlaExpression XlaExpression::Resource(XlaResource* resource) {
   return e;
 }

-string XlaExpression::HumanString() const {
+std::string XlaExpression::HumanString() const {
   switch (kind_) {
     case Kind::kInvalid:
       return "invalid";
diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h
index d410b79a3da137..ed0041fc9942a0 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.h
+++ b/tensorflow/compiler/tf2xla/xla_expression.h
@@ -115,7 +115,7 @@ class XlaExpression {
   XlaResource* resource() const { return resource_; }

   // Returns a human-readable summary of the expression.
-  string HumanString() const;
+  std::string HumanString() const;

   // Returns the value of a kValue or kXlaOp as an xla::XlaOp. Returns
   // an erroneous XlaOp if the expression is not a constant or an expression.
diff --git a/tensorflow/compiler/tf2xla/xla_expression_test.cc b/tensorflow/compiler/tf2xla/xla_expression_test.cc
index 7a0cc34de9af2e..797002476aeb1c 100644
--- a/tensorflow/compiler/tf2xla/xla_expression_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_expression_test.cc
@@ -38,14 +38,15 @@ class XlaExpressionTest : public ::testing::Test {
   void SetUp() override {
     client_ = xla::ClientLibrary::LocalClientOrDie();
     builder_ = std::make_unique<xla::XlaBuilder>("acomputation");
-    constant_ = test::AsScalar<int32>(42);
-    op_ = xla::ConstantR0<int32>(builder_.get(), 7);
+    constant_ = test::AsScalar<int32_t>(42);
+    op_ = xla::ConstantR0<int32_t>(builder_.get(), 7);
     non_constant_op_ = xla::Parameter(
         builder_.get(), 0, xla::ShapeUtil::MakeShape(xla::F32, {}), "x");
     resource_ = std::make_unique<XlaResource>(
-        XlaResource::kVariable, /*arg_num=*/0, /*name=*/string("avariable"),
-        DT_INT32, TensorShape({17, 3}), op_, /*tensor_array_size=*/-1,
-        /*tensor_array_gradients=*/std::set<string>(),
+        XlaResource::kVariable, /*arg_num=*/0,
+        /*name=*/std::string("avariable"), DT_INT32, TensorShape({17, 3}), op_,
+        /*tensor_array_size=*/-1,
+        /*tensor_array_gradients=*/std::set<std::string>(),
         /*tensor_array_multiple_writes_aggregate=*/false);
   }
@@ -87,8 +88,8 @@ TEST_F(XlaExpressionTest, AsXlaOp) {
                           builder_->BuildConstantSubGraph(const_as_op));
   TF_ASSERT_OK_AND_ASSIGN(xla::Literal value,
                           client_->ComputeConstant(computation));
-  EXPECT_TRUE(xla::LiteralTestUtil::Equal(xla::LiteralUtil::CreateR0<int32>(42),
-                                          value));
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(
+      xla::LiteralUtil::CreateR0<int32_t>(42), value));
 }

 TEST_F(XlaExpressionTest, GetShape) {
@@ -120,7 +121,7 @@ TEST_F(XlaExpressionTest, ResolveConstant) {
       std::optional<Tensor> op_constant,
       XlaExpression::XlaOp(op_, DT_INT32).ResolveConstant(client_));
   ASSERT_TRUE(op_constant.has_value());
-  test::ExpectTensorEqual<int32>(test::AsScalar<int32>(7), *op_constant);
+  test::ExpectTensorEqual<int32_t>(test::AsScalar<int32_t>(7), *op_constant);

   TF_ASSERT_OK_AND_ASSIGN(std::optional<Tensor> op_nonconstant,
                           XlaExpression::XlaOp(non_constant_op_, DT_FLOAT)
@@ -131,7 +132,7 @@ TEST_F(XlaExpressionTest, ResolveConstant) {
       std::optional<Tensor> constant_constant,
       XlaExpression::Constant(constant_).ResolveConstant(client_));
   ASSERT_TRUE(constant_constant.has_value());
-  test::ExpectTensorEqual<int32>(constant_, *constant_constant);
+  test::ExpectTensorEqual<int32_t>(constant_, *constant_constant);
 }
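// ---------------------------------------------------------------------------
// A minimal sketch (not part of the patch itself) of the mechanical renaming
// these hunks apply throughout: TensorFlow's legacy `string` alias becomes
// `std::string`, and `int32` becomes the fixed-width `int32_t`, including
// inside template arguments. `Describe` below is a hypothetical function,
// shown only to illustrate the before/after shape of the migration.
//
//   // Before:
//   string Describe(const std::set<string>& names);
//   xla::Literal lit = xla::LiteralUtil::CreateR0<int32>(7);
//
//   // After:
//   std::string Describe(const std::set<std::string>& names);
//   xla::Literal lit = xla::LiteralUtil::CreateR0<int32_t>(7);
// ---------------------------------------------------------------------------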
 TEST_F(XlaExpressionTest, ResolveConstantOnResource) {
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 38f01c83db8251..0b3425e5b8524a 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -136,7 +136,7 @@ struct XlaResourceUpdate {
   bool modified;

   // If the resource is a TensorArray, the set of gradients read or written.
-  std::set<string> tensor_array_gradients_accessed;
+  std::set<std::string> tensor_array_gradients_accessed;
 };

 struct XlaCompilationResult {
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 48e562ce5c7810..b374e8c8e81dd6 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -76,12 +76,12 @@ int CountResults(
 // tf2xla::{Feed,Fetch,Variable}. We hold the actual strings in nonempty_names,
 // and hold arrays of pointers in name_ptrs, terminated by a nullptr entry.
 template <typename T>
-void CollectNames(const T& entries, std::vector<string>* nonempty_names,
+void CollectNames(const T& entries, std::vector<std::string>* nonempty_names,
                   std::vector<const char*>* name_ptrs) {
   // First collect `nonempty_names`, to ensure the underlying strings won't
   // change out from under us.
   for (const auto& entry : entries) {
-    const string& name = entry.name();
+    const std::string& name = entry.name();
     if (!name.empty()) {
       nonempty_names->push_back(name);
     }
@@ -90,7 +90,7 @@ void CollectNames(const T& entries, std::vector<string>* nonempty_names,
   name_ptrs->reserve(entries.size() + 1);  // +1 for nullptr array terminator
   size_t nonempty_index = 0;
   for (const auto& entry : entries) {
-    const string& name = entry.name();
+    const std::string& name = entry.name();
     if (!name.empty()) {
       name_ptrs->push_back(nonempty_names->at(nonempty_index).c_str());
       ++nonempty_index;
@@ -158,9 +158,9 @@ XlaJitCompiledCpuFunction::Compile(
       xla::cpu::CreateBufferAllocationInfos(cpu_executable->module(),
                                             buffer_assignment);

-  std::vector<int32> arg_index_table =
+  std::vector<int32_t> arg_index_table =
       xla::cpu::CreateArgIndexTable(buffer_infos);
-  std::vector<int32> result_index_table =
+  std::vector<int32_t> result_index_table =
       xla::cpu::CreateResultIndexTable(buffer_infos);
   TF_ASSIGN_OR_RETURN(size_t result_index,
                       ComputeResultIndex(buffer_assignment));
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
index 0678c3be6c67f6..6f61f472a2fd5a 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
@@ -22,6 +22,7 @@ limitations under the License.

 #include "absl/container/flat_hash_map.h"
 #include "absl/log/check.h"
+#include "tensorflow/compiler/tf2xla/encoded_buffer_allocation_info.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h"
 #include "xla/backends/cpu/buffer_allocation_info.h"
@@ -85,17 +86,17 @@ class XlaJitCompiledCpuFunction {
   std::vector buffer_infos_;

   // The backing array for the arg index table.
-  std::vector<int32> arg_index_table_;
+  std::vector<int32_t> arg_index_table_;

   // The backing array for the result index table.
-  std::vector<int32> result_index_table_;
+  std::vector<int32_t> result_index_table_;

   // The backing arrays of arg and result names. We hold the actual strings in
   // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static
   // data to refer to.
-  std::vector<string> nonempty_arg_names_;
-  std::vector<string> nonempty_variable_names_;
-  std::vector<string> nonempty_result_names_;
+  std::vector<std::string> nonempty_arg_names_;
+  std::vector<std::string> nonempty_variable_names_;
+  std::vector<std::string> nonempty_result_names_;
   std::vector<const char*> arg_names_;
   std::vector<const char*> variable_names_;
   std::vector<const char*> result_names_;
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
index acac1efd73881f..b49e699d6e267f 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
@@ -182,18 +182,18 @@ TEST(XlaJitCompiledCpuFunction, Sum) {
   ASSERT_EQ(function.num_results(), 1);

   // Run the function and check results.
-  *static_cast<int32*>(function.arg_data(0)) = 10;
-  *static_cast<int32*>(function.arg_data(1)) = 32;
+  *static_cast<int32_t*>(function.arg_data(0)) = 10;
+  *static_cast<int32_t*>(function.arg_data(1)) = 32;
   EXPECT_TRUE(function.Run());
   EXPECT_EQ(function.error_msg(), "");
-  EXPECT_EQ(*static_cast<int32*>(function.result_data(0)), 42);
+  EXPECT_EQ(*static_cast<int32_t*>(function.result_data(0)), 42);

   // Run the function again.
-  *static_cast<int32*>(function.arg_data(0)) = 100;
-  *static_cast<int32*>(function.arg_data(1)) = 320;
+  *static_cast<int32_t*>(function.arg_data(0)) = 100;
+  *static_cast<int32_t*>(function.arg_data(1)) = 320;
   EXPECT_TRUE(function.Run());
   EXPECT_EQ(function.error_msg(), "");
-  EXPECT_EQ(*static_cast<int32*>(function.result_data(0)), 420);
+  EXPECT_EQ(*static_cast<int32_t*>(function.result_data(0)), 420);

   // Check name to index lookups.
   EXPECT_TRUE(function.HasNameIndices());
@@ -268,20 +268,20 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) {
   ASSERT_EQ(function.num_results(), 2);

   // Run the function and check results.
-  *static_cast<int32*>(function.arg_data(0)) = 10;
-  *static_cast<int32*>(function.arg_data(1)) = 32;
+  *static_cast<int32_t*>(function.arg_data(0)) = 10;
+  *static_cast<int32_t*>(function.arg_data(1)) = 32;
   EXPECT_TRUE(function.Run());
   EXPECT_EQ(function.error_msg(), "");
-  EXPECT_EQ(*static_cast<int32*>(function.result_data(0)), 10);
-  EXPECT_EQ(*static_cast<int32*>(function.result_data(1)), 42);
+  EXPECT_EQ(*static_cast<int32_t*>(function.result_data(0)), 10);
+  EXPECT_EQ(*static_cast<int32_t*>(function.result_data(1)), 42);

   // Run the function again.
-  *static_cast<int32*>(function.arg_data(0)) = 100;
-  *static_cast<int32*>(function.arg_data(1)) = 320;
+  *static_cast<int32_t*>(function.arg_data(0)) = 100;
+  *static_cast<int32_t*>(function.arg_data(1)) = 320;
   EXPECT_TRUE(function.Run());
   EXPECT_EQ(function.error_msg(), "");
-  EXPECT_EQ(*static_cast<int32*>(function.result_data(0)), 100);
-  EXPECT_EQ(*static_cast<int32*>(function.result_data(1)), 420);
+  EXPECT_EQ(*static_cast<int32_t*>(function.result_data(0)), 100);
+  EXPECT_EQ(*static_cast<int32_t*>(function.result_data(1)), 420);

   // Check name to index lookups.
   EXPECT_TRUE(function.HasNameIndices());
@@ -325,7 +325,7 @@ TEST(XlaJitCompiledCpuFunction, CanCompileWithAdditionalPlatform) {

     int VisibleDeviceCount() const override { return 0; }

-    const string& Name() const override { return name_; }
+    const std::string& Name() const override { return name_; }

     absl::StatusOr<std::unique_ptr<se::DeviceDescription>> DescriptionForDevice(
         int ordinal) const override {
@@ -338,7 +338,7 @@ TEST(XlaJitCompiledCpuFunction, CanCompileWithAdditionalPlatform) {
     }

    private:
-    string name_;
+    std::string name_;
   };

   TF_EXPECT_OK(
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 0456328617e8a8..baefe0138d43dd 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -207,9 +207,9 @@ static absl::Status LiteralToInt64Scalar(const xla::LiteralSlice& literal,
     return errors::InvalidArgument("value is not a scalar");
   }
   if (literal.shape().element_type() == xla::S16) {
-    *out = literal.Get<int16>({});
+    *out = literal.Get<int16_t>({});
   } else if (literal.shape().element_type() == xla::S32) {
-    *out = literal.Get<int32>({});
+    *out = literal.Get<int32_t>({});
   } else if (literal.shape().element_type() == xla::S64) {
     *out = literal.Get<int64_t>({});
   } else {
@@ -370,7 +370,7 @@ static absl::Status LiteralToInt64Vector(const xla::LiteralSlice& literal,
   int64_t size = xla::ShapeUtil::ElementsIn(literal.shape());
   if (literal.shape().element_type() == xla::S32) {
     for (int64_t i = 0; i < size; ++i) {
-      out->push_back(literal.Get<int32>({i}));
+      out->push_back(literal.Get<int32_t>({i}));
     }
   } else if (literal.shape().element_type() == xla::S64) {
     for (int64_t i = 0; i < size; ++i) {
@@ -422,7 +422,7 @@ absl::Status XlaOpKernelContext::ConstantInputAsInt64Literal(
     case xla::S32: {
       *out = xla::Literal(
           xla::ShapeUtil::ChangeElementType(literal.shape(), xla::S64));
-      auto src_data = literal.data<int32>();
+      auto src_data = literal.data<int32_t>();
       for (int64_t i = 0; i < src_data.size(); ++i) {
         out->data<int64_t>()[i] = src_data[i];
       }
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 445065971f2a6a..c74db865769229 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -61,7 +61,7 @@ static absl::Status LaunchOpHasKernelForDevice(const DeviceType& device_type) {
   NodeDef node_def;
   node_def.set_name("_XlaLaunch-op");
   node_def.set_op("XlaLaunch");
-  string kernel_class_name;
+  std::string kernel_class_name;
   TF_RETURN_IF_ERROR(FindKernelDef(device_type, node_def,
                                    /*KernelDef*/ nullptr, &kernel_class_name));
   VLOG(1) << "LaunchOpHasKernelForDevice"
@@ -128,7 +128,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
 }

 /* static */ void XlaOpRegistry::RegisterCompilationDevice(
-    const string& device_name, const DeviceRegistration& registration) {
+    const std::string& device_name, const DeviceRegistration& registration) {
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   auto result =
@@ -138,7 +138,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
 }

 /* static */ void XlaOpRegistry::RegisterBackend(
-    const string& compilation_device_name,
+    const std::string& compilation_device_name,
     absl::Span<const DataType> supported_types, BackendOpFilter op_filter) {
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
@@ -151,14 +151,14 @@ XlaOpRegistry::~XlaOpRegistry() = default;
 }

 /* static */ bool XlaOpRegistry::IsCompilationDevice(
-    const string& device_name) {
+    const std::string& device_name) {
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   return registry.backends_.find(device_name) != registry.backends_.end();
 }

 /* static */ bool XlaOpRegistry::GetCompilationDevice(
-    const string& device_name, const DeviceRegistration** registration) {
+    const std::string& device_name, const DeviceRegistration** registration) {
   XlaOpRegistry& registry = Instance();

   // Lazily register the CPU and GPU JIT devices the first time
@@ -235,7 +235,7 @@ void XlaOpRegistry::RegisterCompilationKernels() {
   // 2. Process op registration without device allowlists:
   //    this pass registers the kernels for all the other supported backends.
   for (auto& ops : registry.ops_) {
-    const string& op_name = ops.first;
+    const std::string& op_name = ops.first;
     std::vector<std::unique_ptr<OpRegistration>>& op_registrations = ops.second;
     // Partition the op registration so that the ones with device allowlists
    // precede the one without device allowlist.
@@ -247,7 +247,7 @@ void XlaOpRegistry::RegisterCompilationKernels() {
     // Collect a set of backend registered by ops with device allowlists.
     // The op registration without allowlists will register a generic kernel
     // for all other backends not in this set.
-    std::unordered_set<string> allowlisted_backend;
+    std::unordered_set<std::string> allowlisted_backend;
     for (auto& op_registration : op_registrations) {
       if (op_registration->has_device_allowlist) {
         allowlisted_backend.insert(op_registration->device_allowlist.begin(),
@@ -267,7 +267,7 @@ void XlaOpRegistry::RegisterCompilationKernels() {
       }
       TF_CHECK_OK(lookup_status);

-      std::unordered_set<string> type_attrs;
+      std::unordered_set<std::string> type_attrs;
       for (const OpDef::AttrDef& attr_def : op_def->attr()) {
         if (attr_def.type() == "type" || attr_def.type() == "list(type)") {
           type_attrs.insert(attr_def.name());
@@ -309,7 +309,7 @@ void XlaOpRegistry::RegisterCompilationKernels() {
       //    b) the types allowed by the OpDef, and
       //    c) the type constraints.
       bool unsatisfiable_type_constraint = false;
-      for (const string& type_attr : type_attrs) {
+      for (const std::string& type_attr : type_attrs) {
         KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint();
         attr_constraint->set_name(type_attr);
         auto* allowed_values =
@@ -375,7 +375,7 @@ void XlaOpRegistry::RegisterCompilationKernels() {
 }

 std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
-    const string& compilation_device_name,
+    const std::string& compilation_device_name,
     bool include_compilation_only_kernels) {
   // Ensure compilation kernels registered.
   RegisterCompilationKernels();
@@ -403,8 +403,8 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
   return kernels;
 }

-/*static*/ std::vector<string> XlaOpRegistry::GetAllRegisteredOps() {
-  std::vector<string> ops;
+/*static*/ std::vector<std::string> XlaOpRegistry::GetAllRegisteredOps() {
+  std::vector<std::string> ops;
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   ops.reserve(registry.ops_.size());
@@ -416,7 +416,7 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
 }

 /*static*/ const std::unordered_set<string>*
-XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) {
+XlaOpRegistry::CompileTimeConstantInputArgNames(const std::string& op) {
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   auto it = registry.ops_.find(op);
@@ -435,10 +435,10 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) {

   DCHECK(op_def != nullptr || op_kernel != nullptr);

-  std::unordered_set<string> compile_time_constant_inputs_from_attr;
-  std::vector<string> compile_time_constant_inputs_vect_from_attr;
+  std::unordered_set<std::string> compile_time_constant_inputs_from_attr;
+  std::vector<std::string> compile_time_constant_inputs_vect_from_attr;

-  const std::unordered_set<string>* compile_time_constant_inputs;
+  const std::unordered_set<std::string>* compile_time_constant_inputs;

   if (TryGetNodeAttr(node_def, kXlaCompileTimeConstantInputsAttr,
                      &compile_time_constant_inputs_vect_from_attr)) {
@@ -459,7 +459,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) {
           << " required constants are: "
           << absl::StrJoin(*compile_time_constant_inputs, ", ");

-  for (const string& input : *compile_time_constant_inputs) {
+  for (const std::string& input : *compile_time_constant_inputs) {
     if (op_def) {
       NameRangeMap input_name_ranges;
       TF_RETURN_IF_ERROR(
@@ -486,7 +486,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) {
   return absl::OkStatus();
 }

-/*static*/ bool XlaOpRegistry::IsMetadataOp(const string& op) {
+/*static*/ bool XlaOpRegistry::IsMetadataOp(const std::string& op) {
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   auto it = registry.ops_.find(op);
@@ -500,8 +500,8 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) {
   return it->second.front()->is_metadata_op;
 }

-std::vector<string> XlaOpRegistry::BackendNames() {
-  std::vector<string> names;
+std::vector<std::string> XlaOpRegistry::BackendNames() {
+  std::vector<std::string> names;
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   names.reserve(registry.backends_.size());
@@ -511,7 +511,7 @@ std::vector<string> XlaOpRegistry::BackendNames() {
   return names;
 }

-bool XlaOpRegistry::IsBackendRegistered(const string& name) {
+bool XlaOpRegistry::IsBackendRegistered(const std::string& name) {
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
   return registry.backends_.find(name) != registry.backends_.end();
@@ -524,7 +524,7 @@ XlaOpRegistry& XlaOpRegistry::Instance() {

 XlaOpRegistrationBuilder::XlaOpRegistrationBuilder(absl::string_view name) {
   registration_.reset(new XlaOpRegistry::OpRegistration);
-  registration_->name = string(name);
+  registration_->name = std::string(name);
 }

 XlaOpRegistrationBuilder XlaOpRegistrationBuilder::Name(
@@ -572,7 +572,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowStringType() {
 XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
     absl::string_view attr_name, DataType allowed) {
   std::set<DataType>& types =
-      registration_->type_constraints[string(attr_name)];
+      registration_->type_constraints[std::string(attr_name)];
   types.insert(allowed);
   return *this;
 }

@@ -580,7 +580,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
 XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
     absl::string_view attr_name, absl::Span<const DataType> allowed) {
   std::set<DataType>& types =
-      registration_->type_constraints[string(attr_name)];
+      registration_->type_constraints[std::string(attr_name)];
   for (DataType t : allowed) {
     types.insert(t);
   }
@@ -628,7 +628,7 @@ XlaBackendRegistrar::XlaBackendRegistrar(
     absl::string_view name, absl::Span<const DataType> types,
     XlaOpRegistry::BackendOpFilter op_filter) {
   XlaOpRegistry& registry = XlaOpRegistry::Instance();
-  registry.RegisterBackend(string(name), types, op_filter);
+  registry.RegisterBackend(std::string(name), types, op_filter);

   AddSymbolicExecutionDevice(name);
 }
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 5eaf0fb2d42bfa..9ce6e263f8feb4 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -139,7 +139,7 @@ class XlaOpRegistry {
   // Describes how to compile operators assigned to a device.
   struct DeviceRegistration {
     // The name of the an XLA compilation device to use to compile code.
-    string compilation_device_name;
+    std::string compilation_device_name;

     // When should we autocluster operators assigned to this device?
     AutoclusteringPolicy autoclustering_policy;
@@ -190,25 +190,25 @@ class XlaOpRegistry {
   // `backend_op_filter` should return true if the op should be registered on
   // the device; it may optionally modify the KernelDef.
   typedef bool (*BackendOpFilter)(KernelDef* kdef);
-  static void RegisterBackend(const string& compilation_device_name,
+  static void RegisterBackend(const std::string& compilation_device_name,
                               absl::Span<const DataType> supported_types,
                               BackendOpFilter op_filter);

   // Returns the names of the registered backends.
-  static std::vector<string> BackendNames();
+  static std::vector<std::string> BackendNames();

   // Returns true iff a backend with the given name is registered.
-  static bool IsBackendRegistered(const string& name);
+  static bool IsBackendRegistered(const std::string& name);

   // Registers `device_name` for XLA compilation, using information from
   // `registration`.
   // Does nothing if a registration for `device_name` already exists.
-  static void RegisterCompilationDevice(const string& device_name,
+  static void RegisterCompilationDevice(const std::string& device_name,
                                         const DeviceRegistration& registration);

   // Returns whether the device name is for the JIT device used exclusively for
   // TF2XLA conversion.
-  static bool IsCompilationDevice(const string& device_name);
+  static bool IsCompilationDevice(const std::string& device_name);

   // Returns the JIT device name associated with 'device_name', setting
   // 'jit_device_name', 'requires_jit', and 'enabled_jit_by_default', if they
@@ -216,7 +216,7 @@ class XlaOpRegistry {
   // JIT device is registered.
   // '*enable_jit_by_default' is set to true if we should try to JIT using this
   // device when the JIT is enabled via the Session OptimizerOptions.
-  static bool GetCompilationDevice(const string& device_name,
+  static bool GetCompilationDevice(const std::string& device_name,
                                    const DeviceRegistration** registration);

   // Registers all JIT kernels on JIT devices, if not already registered.
@@ -227,11 +227,11 @@ class XlaOpRegistry {
   // 'compilation_device_name'. Does not include kernels registered as
   // CompilationOnly, iff include_compilation_only_kernels=false.
   static std::vector<const KernelDef*> DeviceKernels(
-      const string& compilation_device_name,
+      const std::string& compilation_device_name,
       bool include_compilation_only_kernels);

   // Returns all operations for which there are XLA kernels on any device.
-  static std::vector<string> GetAllRegisteredOps();
+  static std::vector<std::string> GetAllRegisteredOps();

   // Returns (via `result`) the indices of inputs to `node_def` that must be
   // compile-time constants. Returns an empty vector if the op is not
@@ -265,11 +265,11 @@ class XlaOpRegistry {
   // Return names of arguments for a given op which are supposed to be
   // constants.
   static const std::unordered_set<string>*
-  CompileTimeConstantInputArgNames(const string& op);
+  CompileTimeConstantInputArgNames(const std::string& op);

   // Returns true if `op` is a "metadata" op, one that only looks at the shapes
   // of its operands and not their values.
-  static bool IsMetadataOp(const string& op);
+  static bool IsMetadataOp(const std::string& op);

  private:
   friend class XlaBackendRegistrar;
@@ -298,15 +298,15 @@ class XlaOpRegistry {
   };

   // Map from compilation device names to a description of the backend.
-  std::unordered_map<string, Backend> backends_ TF_GUARDED_BY(mutex_);
+  std::unordered_map<std::string, Backend> backends_ TF_GUARDED_BY(mutex_);

   // Map from Tensorflow device names to the corresponding JIT device metadata.
-  std::unordered_map<string, DeviceRegistration> compilation_devices_
+  std::unordered_map<std::string, DeviceRegistration> compilation_devices_
       TF_GUARDED_BY(mutex_);

   // A description of a Tensorflow operator that can be compiled to XLA.
   struct OpRegistration {
-    string name;
+    std::string name;

     // Should this operator be registered only on compilation devices, without a
     // dummy kernel registered on the corresponding XLA device?
@@ -325,15 +325,15 @@ class XlaOpRegistry {
     bool allow_string_type = false;

     // Mapping from attribute name to a list of supported types.
-    std::unordered_map<string, std::set<DataType>> type_constraints;
+    std::unordered_map<std::string, std::set<DataType>> type_constraints;

     // An optional allowlist of devices. If there is no allowlist, all devices
     // are permitted.
     bool has_device_allowlist = false;
-    std::unordered_set<string> device_allowlist;
+    std::unordered_set<std::string> device_allowlist;

     // Names of arguments that must be compile-time constants.
-    std::unordered_set<string> compile_time_constant_inputs;
+    std::unordered_set<std::string> compile_time_constant_inputs;

     // True if this is a "metadata" op, one that only looks at the shapes of its
     // operands and not their values.
@@ -360,8 +360,8 @@ class XlaOpRegistry {
   // Map from operator name to OpRegistrations, populated by REGISTER_XLA_OP.
   // Registrations present under the same key must satisfy IsCompatible above,
   // and this is checked during registration.
-  std::unordered_map<string, std::vector<std::unique_ptr<OpRegistration>>> ops_
-      TF_GUARDED_BY(mutex_);
+  std::unordered_map<std::string, std::vector<std::unique_ptr<OpRegistration>>>
+      ops_ TF_GUARDED_BY(mutex_);

   // Have we already registered the JIT kernels on the JIT devices?
   bool jit_kernels_registered_ = false;
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index 5b894d07e121ba..962b0e473a826c 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -51,29 +51,29 @@ namespace tensorflow {
 }

 /*static*/ std::unique_ptr<XlaResource> XlaResource::CreateStack(
-    string name, DataType type, int64_t max_size) {
+    std::string name, DataType type, int64_t max_size) {
   return std::make_unique<XlaResource>(
       XlaResource::kStack, /*arg_num=*/-1, std::move(name), type,
       TensorShape(), /*initial_value=*/xla::XlaOp(),
       /*max_array_size=*/max_size,
-      /*tensor_array_gradients=*/std::set<string>{},
+      /*tensor_array_gradients=*/std::set<std::string>{},
       /*tensor_array_multiple_writes_aggregate=*/false);
 }

 /*static*/ std::unique_ptr<XlaResource> XlaResource::CreateTensorArray(
-    string name, DataType type, TensorShape shape, xla::XlaOp initial_value,
-    int64_t max_array_size) {
+    std::string name, DataType type, TensorShape shape,
+    xla::XlaOp initial_value, int64_t max_array_size) {
   return std::make_unique<XlaResource>(
       XlaResource::kTensorArray, /*arg_num=*/-1, std::move(name), type, shape,
       initial_value, max_array_size,
-      /*tensor_array_gradients=*/std::set<string>{},
+      /*tensor_array_gradients=*/std::set<std::string>{},
       /*tensor_array_multiple_writes_aggregate=*/false);
 }

 XlaResource::XlaResource(
-    Kind kind, int arg_num, string name, DataType type, TensorShape shape,
+    Kind kind, int arg_num, std::string name, DataType type, TensorShape shape,
     xla::XlaOp initial_value, int64_t max_array_size,
-    const std::set<string>& tensor_array_gradients,
+    const std::set<std::string>& tensor_array_gradients,
     bool tensor_array_multiple_writes_aggregate,
     const std::optional<ManagedStackTrace>& definition_stack_trace)
     : kind_(kind),
@@ -89,7 +89,7 @@ XlaResource::XlaResource(
       definition_stack_trace_(definition_stack_trace) {
   CHECK(kind_ != kInvalid);

-  for (const string& gradient : tensor_array_gradients) {
+  for (const std::string& gradient : tensor_array_gradients) {
     tensor_array_gradients_[gradient].reset(new XlaResource(
         /*kind=*/kTensorArray, /*arg_num=*/-1,
         /*name=*/absl::StrCat("TensorArrayGrad: ", name_), type_, shape_,
@@ -163,7 +163,7 @@ absl::Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) {
       value_ =
           xla::Tuple(builder, {xla::Broadcast(XlaHelpers::Zero(builder, type_),
                                               ta_shape.dim_sizes()),
-                               xla::ConstantR0<int32>(builder, 0)});
+                               xla::ConstantR0<int32_t>(builder, 0)});
       break;
     }
@@ -175,7 +175,7 @@ absl::Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) {
 }

 absl::Status XlaResource::GetOrCreateTensorArrayGradient(
-    const string& source, xla::XlaBuilder* builder,
+    const std::string& source, xla::XlaBuilder* builder,
     XlaResource** gradient_out) {
   VLOG(2) << "Gradient lookup for resource: " << name_
           << " gradient: " << source;
@@ -214,9 +214,9 @@ absl::Status XlaResource::Pack(xla::XlaOp* pack,
   return absl::OkStatus();
 }

-absl::Status XlaResource::SetFromPack(const std::set<string>& gradient_sources,
-                                      const xla::XlaOp pack,
-                                      xla::XlaBuilder* builder) {
+absl::Status XlaResource::SetFromPack(
+    const std::set<std::string>& gradient_sources, const xla::XlaOp pack,
+    xla::XlaBuilder* builder) {
   if (gradient_sources.empty()) {
     if (!initialized()) {
       initial_value_ = pack;
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index d4c8f7c1c9347f..07c826d21e8b3d 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -43,18 +43,19 @@ class XlaResource {
   static absl::string_view KindToString(Kind kind);

   // Creates a new Stack resource.
-  static std::unique_ptr<XlaResource> CreateStack(string name, DataType type,
+  static std::unique_ptr<XlaResource> CreateStack(std::string name,
+                                                  DataType type,
                                                   int64_t max_size);

   // Creates a new TensorArray resource.
   static std::unique_ptr<XlaResource> CreateTensorArray(
-      string name, DataType type, TensorShape shape, xla::XlaOp initial_value,
-      int64_t max_array_size);
+      std::string name, DataType type, TensorShape shape,
+      xla::XlaOp initial_value, int64_t max_array_size);

-  XlaResource(Kind kind, int arg_num, string name, DataType type,
+  XlaResource(Kind kind, int arg_num, std::string name, DataType type,
               TensorShape shape, xla::XlaOp initial_value,
               int64_t max_array_size,
-              const std::set<string>& tensor_array_gradients,
+              const std::set<std::string>& tensor_array_gradients,
               bool tensor_array_multiple_writes_aggregate,
               const std::optional<ManagedStackTrace>& definition_stack_trace =
                   std::nullopt);
@@ -72,7 +73,7 @@ class XlaResource {
   int arg_num() const { return arg_num_; }

   // A descriptive name for the resource, used in error messages.
-  const string& name() const { return name_; }
+  const std::string& name() const { return name_; }

   // Current type and value of the resource. Uninitialized resources are
   // represented by a default (zero) handle and type DT_INVALID.
@@ -121,7 +122,7 @@ class XlaResource {
   // exist. The call target must be an initialized TensorArray resource. A
   // TensorArray can have multiple named gradients; see the operator
   // documentation for TensorArrayGradV3 for details.
-  absl::Status GetOrCreateTensorArrayGradient(const string& source,
+  absl::Status GetOrCreateTensorArrayGradient(const std::string& source,
                                               xla::XlaBuilder* builder,
                                               XlaResource** gradient_out);

@@ -138,7 +139,7 @@ class XlaResource {
   // If `reset_initial_values` is true, sets the initial_values as well as the
   // values.
   // Opposite of Pack().
-  absl::Status SetFromPack(const std::set<string>& gradient_sources,
+  absl::Status SetFromPack(const std::set<std::string>& gradient_sources,
                            xla::XlaOp pack, xla::XlaBuilder* builder);

   bool IsOverwritten() { return is_overwritten_; }
@@ -164,15 +165,15 @@ class XlaResource {
   // string, irrespective of the number of calls to TensorArrayGrad. The map
   // is ordered since values are packed into tuples by Pack() sorted by name
   // order.
-  const std::map<string, std::unique_ptr<XlaResource>>& tensor_array_gradients()
-      const {
+  const std::map<std::string, std::unique_ptr<XlaResource>>&
+  tensor_array_gradients() const {
     return tensor_array_gradients_;
   }

 private:
   const Kind kind_;
   const int arg_num_;
-  const string name_;
+  const std::string name_;

   DataType type_;
   TensorShape shape_;
@@ -186,7 +187,7 @@ class XlaResource {
   int64_t max_array_size_ = -1;
   bool tensor_array_multiple_writes_aggregate_ = false;

-  std::map<string, std::unique_ptr<XlaResource>> tensor_array_gradients_;
+  std::map<std::string, std::unique_ptr<XlaResource>> tensor_array_gradients_;
   bool is_overwritten_ = false;

   std::optional<ManagedStackTrace> definition_stack_trace_;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 561a4ca410c5e0..b76a4ffd8955b9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1944,21 +1944,15 @@ tf_cc_tests(
 )

 tf_cc_tests(
-    name = "cell_reader_test",
+    name = "test_utils_test",
     size = "small",
     srcs = [
-        "//tensorflow/core/lib/monitoring:cell_reader_test.cc",
         "//tensorflow/core/lib/monitoring:test_utils_test.cc",
     ],
     deps = [
         ":protos_all_cc",
         ":test",
         ":test_main",
-        "//tensorflow/core/lib/monitoring:cell_reader",
-        "//tensorflow/core/lib/monitoring:counter",
-        "//tensorflow/core/lib/monitoring:gauge",
-        "//tensorflow/core/lib/monitoring:percentile_sampler",
-        "//tensorflow/core/lib/monitoring:sampler",
         "//tensorflow/core/lib/monitoring:test_utils",
         "//tensorflow/core/lib/monitoring:types",
         "//tensorflow/core/platform:errors",
diff --git a/tensorflow/core/activity_watcher/activity.h b/tensorflow/core/activity_watcher/activity.h
index eecd207a33fe27..fba51b43f8a3ce 100644
--- a/tensorflow/core/activity_watcher/activity.h
+++ b/tensorflow/core/activity_watcher/activity.h
@@ -32,7 +32,7 @@ namespace tensorflow {
 namespace activity_watcher {

-using ActivityId = tsl::uint64;
+using ActivityId = uint64_t;
 constexpr ActivityId kActivityNotRecorded = 0;
 constexpr int kWatcherDisabled = 0;

@@ -45,7 +45,7 @@ enum ActivityCategory {
   kRendezvous = 5,
 };

-static tsl::string ToString(ActivityCategory category) {
+static std::string ToString(ActivityCategory category) {
   switch (category) {
     case ActivityCategory::kCollective:
       return "Collective";
@@ -64,17 +64,17 @@ static tsl::string ToString(ActivityCategory category) {

 // An activity to be recorded.
 struct Activity {
-  using Attributes = absl::flat_hash_map<tsl::string, tsl::string>;
+  using Attributes = absl::flat_hash_map<std::string, std::string>;
   // A human readable title of the activity.
-  tsl::string title;
+  std::string title;
   // The category of the activity.
   ActivityCategory category = ActivityCategory::kMisc;
   // Key/value pairs that are attached to the activity.
   Attributes attributes;

   Activity() = default;
-  Activity(tsl::string title, ActivityCategory category)
+  Activity(std::string title, ActivityCategory category)
       : title(std::move(title)), category(category) {}
-  Activity(tsl::string title, ActivityCategory category, Attributes attributes)
+  Activity(std::string title, ActivityCategory category, Attributes attributes)
       : title(std::move(title)),
         category(category),
         attributes(std::move(attributes)) {}
diff --git a/tensorflow/core/activity_watcher/activity_utils.cc b/tensorflow/core/activity_watcher/activity_utils.cc
index b3631076c5c2d9..58b3909a25789c 100644
--- a/tensorflow/core/activity_watcher/activity_utils.cc
+++ b/tensorflow/core/activity_watcher/activity_utils.cc
@@ -28,7 +28,7 @@ namespace tensorflow {
 namespace activity_watcher {

 std::unique_ptr<Activity> ActivityFromContext(
-    OpKernelContext* context, tsl::string name, ActivityCategory category,
+    OpKernelContext* context, std::string name, ActivityCategory category,
     Activity::Attributes additional_attributes) {
   Activity::Attributes attributes(std::move(additional_attributes));
   if (context) {
diff --git a/tensorflow/core/activity_watcher/activity_utils.h b/tensorflow/core/activity_watcher/activity_utils.h
index 64958cd5e09744..749ef1326ae565 100644
--- a/tensorflow/core/activity_watcher/activity_utils.h
+++ b/tensorflow/core/activity_watcher/activity_utils.h
@@ -29,7 +29,7 @@ namespace activity_watcher {
 // A convenient way to create an activity. Writes OpKernelContext information
 // and given attributes to a new activity and returns.
 std::unique_ptr<Activity> ActivityFromContext(
-    OpKernelContext* context, tsl::string name, ActivityCategory category,
+    OpKernelContext* context, std::string name, ActivityCategory category,
     Activity::Attributes additional_attributes = Activity::Attributes());

 }  // namespace activity_watcher
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 76b8cc01324619..caf20c11b93566 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -65,6 +65,7 @@ cc_library(
         "//tensorflow/core:op_gen_lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index 7f844e88ba90c6..3c954cf076ddc8 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -43,26 +43,27 @@ namespace {

 constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt";

-string DefaultApiDefDir() {
+std::string DefaultApiDefDir() {
   return GetDataDependencyFilepath(
       io::JoinPath("tensorflow", "core", "api_def", "base_api"));
 }

-string PythonApiDefDir() {
+std::string PythonApiDefDir() {
   return GetDataDependencyFilepath(
       io::JoinPath("tensorflow", "core", "api_def", "python_api"));
 }

 // Reads golden ApiDef files and returns a map from file name to ApiDef file
 // contents.
-void GetGoldenApiDefs(Env* env, const string& api_files_dir,
-                      std::unordered_map<string, ApiDef>* name_to_api_def) {
-  std::vector<string> matching_paths;
+void GetGoldenApiDefs(
+    Env* env, const std::string& api_files_dir,
+    std::unordered_map<std::string, ApiDef>* name_to_api_def) {
+  std::vector<std::string> matching_paths;
   TF_CHECK_OK(env->GetMatchingPaths(
       io::JoinPath(api_files_dir, kApiDefFilePattern), &matching_paths));

   for (auto& file_path : matching_paths) {
-    string file_contents;
+    std::string file_contents;
     TF_CHECK_OK(ReadFileToString(env, file_path, &file_contents));
     file_contents = PBTxtFromMultiline(file_contents);
@@ -76,8 +77,9 @@ void GetGoldenApiDefs(Env* env, const string& api_files_dir,
 }

 void TestAllApiDefsHaveCorrespondingOp(
-    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
-  std::unordered_set<string> op_names;
+    const OpList& ops,
+    const std::unordered_map<std::string, ApiDef>& api_defs_map) {
+  std::unordered_set<std::string> op_names;
   for (const auto& op : ops.op()) {
     op_names.insert(op.name());
   }
@@ -89,7 +91,8 @@ void TestAllApiDefsHaveCorrespondingOp(
 }

 void TestAllApiDefInputArgsAreValid(
-    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+    const OpList& ops,
+    const std::unordered_map<std::string, ApiDef>& api_defs_map) {
   for (const auto& op : ops.op()) {
     const auto api_def_iter = api_defs_map.find(op.name());
     if (api_def_iter == api_defs_map.end()) {
@@ -113,7 +116,8 @@ void TestAllApiDefInputArgsAreValid(
 }

 void TestAllApiDefOutputArgsAreValid(
-    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+    const OpList& ops,
+    const std::unordered_map<std::string, ApiDef>& api_defs_map) {
   for (const auto& op : ops.op()) {
     const auto api_def_iter = api_defs_map.find(op.name());
     if (api_def_iter == api_defs_map.end()) {
@@ -137,7 +141,8 @@ void TestAllApiDefOutputArgsAreValid(
 }

 void TestAllApiDefAttributeNamesAreValid(
-    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+    const OpList& ops,
+    const std::unordered_map<std::string, ApiDef>& api_defs_map) {
   for (const auto& op : ops.op()) {
     const auto api_def_iter = api_defs_map.find(op.name());
     if (api_def_iter == api_defs_map.end()) {
@@ -159,7 +164,7 @@ void TestAllApiDefAttributeNamesAreValid(
 }

 void TestDeprecatedAttributesSetCorrectly(
-    const std::unordered_map<string, ApiDef>& api_defs_map) {
+    const std::unordered_map<std::string, ApiDef>& api_defs_map) {
   for (const auto& name_and_api_def : api_defs_map) {
     int num_deprecated_endpoints = 0;
     const auto& api_def = name_and_api_def.second;
@@ -186,7 +191,7 @@ void TestDeprecatedAttributesSetCorrectly(
 }

 void TestDeprecationVersionSetCorrectly(
-    const std::unordered_map<string, ApiDef>& api_defs_map) {
+    const std::unordered_map<std::string, ApiDef>& api_defs_map) {
   for (const auto& name_and_api_def : api_defs_map) {
     const auto& name = name_and_api_def.first;
     const auto& api_def = name_and_api_def.second;
@@ -205,13 +210,13 @@ class BaseApiTest : public ::testing::Test {
 protected:
  BaseApiTest() {
    OpRegistry::Global()->Export(false, &ops_);
-    const std::vector<string> multi_line_fields = {"description"};
+    const std::vector<std::string> multi_line_fields = {"description"};

    Env* env = Env::Default();
    GetGoldenApiDefs(env, DefaultApiDefDir(), &api_defs_map_);
  }

  OpList ops_;
-  std::unordered_map<string, ApiDef> api_defs_map_;
+  std::unordered_map<std::string, ApiDef> api_defs_map_;
 };

 // Check that all ops have an ApiDef.
@@ -233,7 +238,7 @@ TEST_F(BaseApiTest, AllApiDefsHaveCorrespondingOp) {
   TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_);
 }

-string GetOpDefHasDocStringError(const string& op_name) {
+std::string GetOpDefHasDocStringError(const std::string& op_name) {
   return strings::Printf(
       "OpDef for %s has a doc string. "
       "Doc strings must be defined in ApiDef instead of OpDef. "
@@ -301,13 +306,13 @@ class PythonApiTest : public ::testing::Test {
 protected:
  PythonApiTest() {
    OpRegistry::Global()->Export(false, &ops_);
-    const std::vector<string> multi_line_fields = {"description"};
+    const std::vector<std::string> multi_line_fields = {"description"};

    Env* env = Env::Default();
    GetGoldenApiDefs(env, PythonApiDefDir(), &api_defs_map_);
  }

  OpList ops_;
-  std::unordered_map<string, ApiDef> api_defs_map_;
+  std::unordered_map<std::string, ApiDef> api_defs_map_;
 };

 // Check that ApiDefs have a corresponding op.
diff --git a/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt b/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt
index 7c4db1f721a032..41868ddc6c649f 100644
--- a/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt
@@ -1,5 +1,12 @@
 op {
   graph_op_name: "ComplexAbs"
+  attr {
+    name: "Tout"
+    description: <
diff --git a/tensorflow/core/api_def/update_api_def.cc b/tensorflow/core/api_def/update_api_def.cc
 #include
+#include "absl/strings/str_format.h"
 #include "tensorflow/core/api_def/excluded_ops.h"
 #include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/op.h"
@@ -124,7 +125,7 @@ bool CheckDocsMatch(const OpDef& op1, const OpDef& op2) {

 // Returns true if descriptions and summaries in op match a
 // given single doc-string.
-bool ValidateOpDocs(const OpDef& op, const string& doc) {
+bool ValidateOpDocs(const OpDef& op, const std::string& doc) {
   OpDefBuilder b(op.name());
   // We don't really care about type we use for arguments and
   // attributes. We just want to make sure attribute and argument names
@@ -146,28 +147,28 @@ bool ValidateOpDocs(const OpDef& op, const std::string& doc) {
   }
 }  // namespace

-string RemoveDoc(const OpDef& op, const string& file_contents,
-                 size_t start_location) {
+std::string RemoveDoc(const OpDef& op, const std::string& file_contents,
+                      size_t start_location) {
   // Look for a line starting with .Doc( after the REGISTER_OP.
   const auto doc_start_location = file_contents.find(kDocStart, start_location);
-  const string format_error = strings::Printf(
+  const std::string format_error = strings::Printf(
       "Could not find %s doc for removal. Make sure the doc is defined with "
       "'%s' prefix and '%s' suffix or remove the doc manually.",
       op.name().c_str(), kDocStart, kDocEnd);
-  if (doc_start_location == string::npos) {
+  if (doc_start_location == std::string::npos) {
     std::cerr << format_error << std::endl;
     LOG(ERROR) << "Didn't find doc start";
     return file_contents;
   }
   const auto doc_end_location = file_contents.find(kDocEnd, doc_start_location);
-  if (doc_end_location == string::npos) {
+  if (doc_end_location == std::string::npos) {
     LOG(ERROR) << "Didn't find doc start";
     std::cerr << format_error << std::endl;
     return file_contents;
   }

   const auto doc_start_size = sizeof(kDocStart) - 1;
-  string doc_text = file_contents.substr(
+  std::string doc_text = file_contents.substr(
       doc_start_location + doc_start_size,
       doc_end_location - doc_start_location - doc_start_size);

@@ -189,12 +190,12 @@ namespace {
 // Remove .Doc calls that follow REGISTER_OP calls for the given ops.
 // We search for REGISTER_OP calls in the given op_files list.
 void RemoveDocs(const std::vector<const OpDef*>& ops,
-                const std::vector<string>& op_files) {
+                const std::vector<std::string>& op_files) {
   // Set of ops that we already found REGISTER_OP calls for.
- std::set processed_ops; + std::set processed_ops; for (const auto& file : op_files) { - string file_contents; + std::string file_contents; bool file_contents_updated = false; TF_CHECK_OK(ReadFileToString(Env::Default(), file, &file_contents)); @@ -203,11 +204,11 @@ void RemoveDocs(const std::vector& ops, // We already found REGISTER_OP call for this op in another file. continue; } - string register_call = + std::string register_call = strings::Printf("REGISTER_OP(\"%s\")", op->name().c_str()); const auto register_call_location = file_contents.find(register_call); // Find REGISTER_OP(OpName) call. - if (register_call_location == string::npos) { + if (register_call_location == std::string::npos) { continue; } std::cout << "Removing .Doc call for " << op->name() << " from " << file @@ -228,11 +229,11 @@ void RemoveDocs(const std::vector& ops, // Returns ApiDefs text representation in multi-line format // constructed based on the given op. -string CreateApiDef(const OpDef& op) { +std::string CreateApiDef(const OpDef& op) { ApiDefs api_defs; FillBaseApiDef(api_defs.add_op(), op); - const std::vector multi_line_fields = {"description"}; + const std::vector multi_line_fields = {"description"}; std::string new_api_defs_str; ::tensorflow::protobuf::TextFormat::PrintToString(api_defs, &new_api_defs_str); @@ -242,8 +243,8 @@ string CreateApiDef(const OpDef& op) { // Creates ApiDef files for any new ops. // If op_file_pattern is not empty, then also removes .Doc calls from // new op registrations in these files. -void CreateApiDefs(const OpList& ops, const string& api_def_dir, - const string& op_file_pattern) { +void CreateApiDefs(const OpList& ops, const std::string& api_def_dir, + const std::string& op_file_pattern) { auto* excluded_ops = GetExcludedOps(); std::vector new_ops_with_docs; @@ -252,9 +253,8 @@ void CreateApiDefs(const OpList& ops, const string& api_def_dir, continue; } // Form the expected ApiDef path. - string file_path = - io::JoinPath(tensorflow::string(api_def_dir), kApiDefFileFormat); - file_path = strings::Printf(file_path.c_str(), op.name().c_str()); + std::string file_name = absl::StrFormat(kApiDefFileFormat, op.name()); + std::string file_path = io::JoinPath(api_def_dir, file_name); // Create ApiDef if it doesn't exist. if (!Env::Default()->FileExists(file_path).ok()) { @@ -268,7 +268,7 @@ void CreateApiDefs(const OpList& ops, const string& api_def_dir, } } if (!op_file_pattern.empty()) { - std::vector op_files; + std::vector op_files; TF_CHECK_OK(Env::Default()->GetMatchingPaths(op_file_pattern, &op_files)); RemoveDocs(new_ops_with_docs, op_files); } diff --git a/tensorflow/core/api_def/update_api_def.h b/tensorflow/core/api_def/update_api_def.h index 1e285c06883efa..1ac71689bba2d0 100644 --- a/tensorflow/core/api_def/update_api_def.h +++ b/tensorflow/core/api_def/update_api_def.h @@ -23,14 +23,14 @@ namespace tensorflow { // Returns ApiDefs text representation in multi-line format // constructed based on the given op. -string CreateApiDef(const OpDef& op); +std::string CreateApiDef(const OpDef& op); // Removes .Doc call for the given op. // If unsuccessful, returns original file_contents and prints an error. // start_location - We search for .Doc call starting at this location // in file_contents. -string RemoveDoc(const OpDef& op, const string& file_contents, - size_t start_location); +std::string RemoveDoc(const OpDef& op, const std::string& file_contents, + size_t start_location); // Creates api_def_*.pbtxt files for any new ops (i.e. 
diff --git a/tensorflow/core/api_def/update_api_def.h b/tensorflow/core/api_def/update_api_def.h
index 1e285c06883efa..1ac71689bba2d0 100644
--- a/tensorflow/core/api_def/update_api_def.h
+++ b/tensorflow/core/api_def/update_api_def.h
@@ -23,14 +23,14 @@ namespace tensorflow {

 // Returns ApiDefs text representation in multi-line format
 // constructed based on the given op.
-string CreateApiDef(const OpDef& op);
+std::string CreateApiDef(const OpDef& op);

 // Removes .Doc call for the given op.
 // If unsuccessful, returns original file_contents and prints an error.
 // start_location - We search for .Doc call starting at this location
 // in file_contents.
-string RemoveDoc(const OpDef& op, const string& file_contents,
-                 size_t start_location);
+std::string RemoveDoc(const OpDef& op, const std::string& file_contents,
+                      size_t start_location);

 // Creates api_def_*.pbtxt files for any new ops (i.e. ops that don't have an
 // api_def_*.pbtxt file yet).
@@ -38,8 +38,8 @@ string RemoveDoc(const OpDef& op, const string& file_contents,
 // look for a REGISTER_OP call for the new ops and removes corresponding
 // .Doc() calls since the newly generated api_def_*.pbtxt files will
 // store the doc strings.
-void CreateApiDefs(const OpList& ops, const string& api_def_dir,
-                   const string& op_file_pattern);
+void CreateApiDefs(const OpList& ops, const std::string& api_def_dir,
+                   const std::string& op_file_pattern);

 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
diff --git a/tensorflow/core/api_def/update_api_def_main.cc b/tensorflow/core/api_def/update_api_def_main.cc
index 3fd975ce178b5f..4cf74abf82cb6f 100644
--- a/tensorflow/core/api_def/update_api_def_main.cc
+++ b/tensorflow/core/api_def/update_api_def_main.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"

 int main(int argc, char** argv) {
-  tensorflow::string api_files_dir;
-  tensorflow::string op_file_pattern;
+  std::string api_files_dir;
+  std::string op_file_pattern;
  std::vector<tensorflow::Flag> flag_list = {
      tensorflow::Flag("api_def_dir", &api_files_dir,
                       "Base directory of api_def*.pbtxt files."),
diff --git a/tensorflow/core/api_def/update_api_def_test.cc b/tensorflow/core/api_def/update_api_def_test.cc
index 4200c9da23c093..23751ffa3ecd25 100644
--- a/tensorflow/core/api_def/update_api_def_test.cc
+++ b/tensorflow/core/api_def/update_api_def_test.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace {

 TEST(UpdateApiDefTest, TestRemoveDocSingleOp) {
-  const string op_def_text = R"opdef(
+  const std::string op_def_text = R"opdef(
 REGISTER_OP("Op1")
     .Input("a: T")
     .Output("output: T")
@@ -32,7 +32,7 @@ REGISTER_OP("Op1")
     .SetShapeFn(shape_inference::UnchangedShape);
 )opdef";

-  const string op_def_text_with_doc = R"opdef(
+  const std::string op_def_text_with_doc = R"opdef(
 REGISTER_OP("Op1")
     .Input("a: T")
     .Output("output: T")
@@ -50,7 +50,7 @@ output: Description for output.
 )doc");
 )opdef";

-  const string op_text = R"(
+  const std::string op_text = R"(
 name: "Op1"
 input_arg {
   name: "a"
@@ -75,7 +75,7 @@ description: "Description\nfor Op1."
 }

 TEST(UpdateApiDefTest, TestRemoveDocMultipleOps) {
-  const string op_def_text = R"opdef(
+  const std::string op_def_text = R"opdef(
 REGISTER_OP("Op1")
     .Input("a: T")
     .SetShapeFn(shape_inference::UnchangedShape);
@@ -89,7 +89,7 @@ REGISTER_OP("Op3")
     .SetShapeFn(shape_inference::UnchangedShape);
 )opdef";

-  const string op_def_text_with_doc = R"opdef(
+  const std::string op_def_text_with_doc = R"opdef(
 REGISTER_OP("Op1")
     .Input("a: T")
     .Doc(R"doc(
@@ -112,21 +112,21 @@ Summary for Op3.
 )doc");
 )opdef";

-  const string op1_text = R"(
+  const std::string op1_text = R"(
 name: "Op1"
 input_arg {
   name: "a"
 }
 summary: "Summary for Op1."
 )";

-  const string op2_text = R"(
+  const std::string op2_text = R"(
 name: "Op2"
 input_arg {
   name: "a"
 }
 summary: "Summary for Op2."
 )";

-  const string op3_text = R"(
+  const std::string op3_text = R"(
 name: "Op3"
 input_arg {
   name: "c"
@@ -138,12 +138,12 @@ summary: "Summary for Op3."
protobuf::TextFormat::ParseFromString(op2_text, &op2); // NOLINT protobuf::TextFormat::ParseFromString(op3_text, &op3); // NOLINT - string updated_text = + std::string updated_text = RemoveDoc(op2, op_def_text_with_doc, op_def_text_with_doc.find("Op2") /* start_location */); - EXPECT_EQ(string::npos, updated_text.find("Summary for Op2")); - EXPECT_NE(string::npos, updated_text.find("Summary for Op1")); - EXPECT_NE(string::npos, updated_text.find("Summary for Op3")); + EXPECT_EQ(std::string::npos, updated_text.find("Summary for Op2")); + EXPECT_NE(std::string::npos, updated_text.find("Summary for Op1")); + EXPECT_NE(std::string::npos, updated_text.find("Summary for Op3")); updated_text = RemoveDoc(op3, updated_text, updated_text.find("Op3") /* start_location */); @@ -153,7 +153,7 @@ summary: "Summary for Op3." } TEST(UpdateApiDefTest, TestCreateApiDef) { - const string op_text = R"( + const std::string op_text = R"( name: "Op1" input_arg { name: "a" @@ -173,7 +173,7 @@ description: "Description\nfor Op1." OpDef op; protobuf::TextFormat::ParseFromString(op_text, &op); // NOLINT - const string expected_api_def = R"(op { + const std::string expected_api_def = R"(op { graph_op_name: "Op1" in_arg { name: "a" diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 5ada8b377b880a..86c504a3fda8e5 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -375,6 +375,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/config:flag_defs", "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:scoped_memory_debug_annotation", "//tensorflow/core/profiler/lib:traceme", @@ -1110,6 +1111,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", + "@com_google_absl//absl/log:check", ], ) diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc index 77231dee43e240..5150869da370f7 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.cc +++ b/tensorflow/core/common_runtime/base_collective_executor.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/config/flag_defs.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/op_kernel.h" @@ -229,6 +230,9 @@ CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks, BaseCollectiveExecutor::~BaseCollectiveExecutor() {} void BaseCollectiveExecutor::StartAbort(const absl::Status& s) { + if (flags::Global().enable_fatal_error_on_collective_abort.value()) { + LOG(FATAL) << "BaseCollectiveExecutor::StartAbort: " << s; + } absl::Status status; { mutex_lock l(status_mu_); diff --git a/tensorflow/core/common_runtime/device/device_event_mgr.h b/tensorflow/core/common_runtime/device/device_event_mgr.h index 7fb0dbc822d676..75847bf66a6e2c 100644 --- a/tensorflow/core/common_runtime/device/device_event_mgr.h +++ b/tensorflow/core/common_runtime/device/device_event_mgr.h @@ -83,7 +83,7 @@ class EventMgr { friend class EventMgrFactory; se::StreamExecutor* const exec_; - const int32 polling_active_delay_usecs_; + const int32_t polling_active_delay_usecs_; mutex mu_; condition_variable events_pending_ TF_GUARDED_BY(mu_); diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 65bd6a8dae1e5d..583fce11a0ef28 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -134,9 +134,9 @@ void SetMemory(NodeExecStatsInterface* stats, OpKernelContext* ctx) { // Time the execution of kernels (in CPU cycles). Used to dynamically identify // inexpensive kernels which can be dispatched inline. struct KernelTimer { - uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); + uint64_t start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); - uint64 ElapsedCycles() { + uint64_t ElapsedCycles() { return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles; } }; @@ -197,14 +197,14 @@ class ExecutorImpl : public Executor { // given node is expensive. The new cost estimate is a weighted average of // the old cost estimate and the latest cost. We only update cost estimates // for kernels for which IsExpensive() return true. - void UpdateCostEstimate(const NodeItem& node, uint64 elapsed_cycles) { + void UpdateCostEstimate(const NodeItem& node, uint64_t elapsed_cycles) { // N.B. Updates to `cost_estimate` are atomic but unlocked. Simultaneous // updates may result in one or more updates being ignored. This does not // affect correctness but may slow down the update frequency. std::atomic_uint_fast64_t& cost_estimate = cost_estimates_[node.node_id]; auto prev_estimate = cost_estimate.load(std::memory_order_relaxed); - uint64 new_estimate = + uint64_t new_estimate = ((kCostDecay - 1) * prev_estimate + elapsed_cycles) / kCostDecay; cost_estimate.store(new_estimate, std::memory_order_relaxed); @@ -214,9 +214,9 @@ class ExecutorImpl : public Executor { // Initial time (in CPU cycles) we expect an operation to take. Used to // determine whether an operation should be place in a threadpool. // Operations start out "expensive". 
diff --git a/tensorflow/core/common_runtime/device/device_event_mgr.h b/tensorflow/core/common_runtime/device/device_event_mgr.h
index 7fb0dbc822d676..75847bf66a6e2c 100644
--- a/tensorflow/core/common_runtime/device/device_event_mgr.h
+++ b/tensorflow/core/common_runtime/device/device_event_mgr.h
@@ -83,7 +83,7 @@ class EventMgr {
  friend class EventMgrFactory;

  se::StreamExecutor* const exec_;
-  const int32 polling_active_delay_usecs_;
+  const int32_t polling_active_delay_usecs_;
  mutex mu_;
  condition_variable events_pending_ TF_GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 65bd6a8dae1e5d..583fce11a0ef28 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -134,9 +134,9 @@ void SetMemory(NodeExecStatsInterface* stats, OpKernelContext* ctx) {
 // Time the execution of kernels (in CPU cycles). Used to dynamically identify
 // inexpensive kernels which can be dispatched inline.
 struct KernelTimer {
-  uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle();
+  uint64_t start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle();

-  uint64 ElapsedCycles() {
+  uint64_t ElapsedCycles() {
    return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles;
  }
 };
@@ -197,14 +197,14 @@ class ExecutorImpl : public Executor {
  // given node is expensive. The new cost estimate is a weighted average of
  // the old cost estimate and the latest cost. We only update cost estimates
  // for kernels for which IsExpensive() returns true.
-  void UpdateCostEstimate(const NodeItem& node, uint64 elapsed_cycles) {
+  void UpdateCostEstimate(const NodeItem& node, uint64_t elapsed_cycles) {
    // N.B. Updates to `cost_estimate` are atomic but unlocked. Simultaneous
    // updates may result in one or more updates being ignored. This does not
    // affect correctness but may slow down the update frequency.
    std::atomic_uint_fast64_t& cost_estimate = cost_estimates_[node.node_id];
    auto prev_estimate = cost_estimate.load(std::memory_order_relaxed);

-    uint64 new_estimate =
+    uint64_t new_estimate =
        ((kCostDecay - 1) * prev_estimate + elapsed_cycles) / kCostDecay;

    cost_estimate.store(new_estimate, std::memory_order_relaxed);
@@ -214,9 +214,9 @@ class ExecutorImpl : public Executor {
  // Initial time (in CPU cycles) we expect an operation to take. Used to
  // determine whether an operation should be placed in a threadpool.
  // Operations start out "expensive".
-  static constexpr uint64 kInitialCostEstimateCycles = 100 * 1000 * 1000;
-  static constexpr uint64 kOpIsExpensiveThresholdCycles = 8000;
-  static constexpr uint64 kCostDecay = 10;
+  static constexpr uint64_t kInitialCostEstimateCycles = 100 * 1000 * 1000;
+  static constexpr uint64_t kOpIsExpensiveThresholdCycles = 8000;
+  static constexpr uint64_t kCostDecay = 10;

  std::vector<bool> is_expensive_;
  // std::unique_ptr<std::atomic<bool>[]> is_expensive_;
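The cost-model constants above feed the UpdateCostEstimate recurrence shown earlier: each measurement moves the estimate 1/kCostDecay of the way toward the latest sample, starting from a deliberately "expensive" prior. A tiny numeric illustration of that decay, using the same formula in plain C++:

#include <cstdint>
#include <iostream>

constexpr uint64_t kCostDecay = 10;

// Same weighted average as UpdateCostEstimate above.
uint64_t Update(uint64_t prev_estimate, uint64_t elapsed_cycles) {
  return ((kCostDecay - 1) * prev_estimate + elapsed_cycles) / kCostDecay;
}

int main() {
  // From the 100M-cycle prior, a kernel that keeps measuring 1000 cycles
  // decays toward 1000 and eventually drops below the 8000-cycle
  // "expensive" threshold.
  uint64_t estimate = 100 * 1000 * 1000;
  for (int i = 0; i < 5; ++i) {
    estimate = Update(estimate, 1000);
    std::cout << estimate << "\n";  // 90000100, 81000190, 72900271, ...
  }
}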
@@ -369,14 +369,14 @@ class ExecutorState {
  // Maximum number of kernels that can be scheduled inline. If lots of kernels
  // are ready at the same time, scheduling them in one thread can be very slow.
  // TODO(fishx): Make it configurable if necessary.
-  static constexpr uint64 kInlineScheduleReadyThreshold = 500;
+  static constexpr uint64_t kInlineScheduleReadyThreshold = 500;

  // Not owned.
  RendezvousInterface* rendezvous_;
  CollectiveExecutor* collective_executor_ = nullptr;
  const ConfigProto* const session_config_;
  SessionState* session_state_;
-  string session_handle_;
+  std::string session_handle_;
  const SessionMetadata* session_metadata_ = nullptr;
  TensorStore* tensor_store_;
  // Step-local container.
@@ -1099,7 +1099,7 @@ absl::Status ExecutorState::ProcessOutputs(
  }
  if (s.code() == error::RESOURCE_EXHAUSTED) {
    if (stats_collector_) {
-      string err =
+      std::string err =
          stats_collector_->ReportAllocsOnResourceExhausted(s.message());
      s = errors::CreateWithUpdatedMessage(s, absl::StrCat(s.message(), err));
    } else {
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 850d48694c1982..cbe63568f69de9 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -105,7 +105,7 @@ class Executor {
    const ConfigProto* session_config = nullptr;
    SessionState* session_state = nullptr;
    // Unique session identifier. Can be empty.
-    string session_handle;
+    std::string session_handle;
    TensorStore* tensor_store = nullptr;
    ScopedStepContainer* step_container = nullptr;
    CollectiveExecutor* collective_executor = nullptr;
diff --git a/tensorflow/core/common_runtime/executor_factory.cc b/tensorflow/core/common_runtime/executor_factory.cc
index 8d4c440b46aa4b..8346b47748484d 100644
--- a/tensorflow/core/common_runtime/executor_factory.cc
+++ b/tensorflow/core/common_runtime/executor_factory.cc
@@ -29,7 +29,7 @@ namespace {

 static mutex executor_factory_lock(LINKER_INITIALIZED);

-typedef std::unordered_map<string, ExecutorFactory*> ExecutorFactories;
+typedef std::unordered_map<std::string, ExecutorFactory*> ExecutorFactories;
 ExecutorFactories* executor_factories() {
  static ExecutorFactories* factories = new ExecutorFactories;
  return factories;
@@ -37,7 +37,7 @@ ExecutorFactories* executor_factories() {
 }  // namespace

-void ExecutorFactory::Register(const string& executor_type,
+void ExecutorFactory::Register(const std::string& executor_type,
                               ExecutorFactory* factory) {
  mutex_lock l(executor_factory_lock);
  if (!executor_factories()->insert({executor_type, factory}).second) {
@@ -47,9 +47,9 @@ void ExecutorFactory::Register(const string& executor_type,
 }

 namespace {
-const string RegisteredFactoriesErrorMessageLocked()
+const std::string RegisteredFactoriesErrorMessageLocked()
    TF_SHARED_LOCKS_REQUIRED(executor_factory_lock) {
-  std::vector<string> factory_types;
+  std::vector<std::string> factory_types;
  for (const auto& executor_factory : *executor_factories()) {
    factory_types.push_back(executor_factory.first);
  }
@@ -58,7 +58,7 @@ const string RegisteredFactoriesErrorMessageLocked()
 }
 }  // namespace

-absl::Status ExecutorFactory::GetFactory(const string& executor_type,
+absl::Status ExecutorFactory::GetFactory(const std::string& executor_type,
                                         ExecutorFactory** out_factory) {
  tf_shared_lock l(executor_factory_lock);
@@ -73,7 +73,7 @@ absl::Status ExecutorFactory::GetFactory(const string& executor_type,
  return absl::OkStatus();
 }

-absl::Status NewExecutor(const string& executor_type,
+absl::Status NewExecutor(const std::string& executor_type,
                         const LocalExecutorParams& params, const Graph& graph,
                         std::unique_ptr<Executor>* out_executor) {
  ExecutorFactory* factory = nullptr;
diff --git a/tensorflow/core/common_runtime/executor_factory.h b/tensorflow/core/common_runtime/executor_factory.h
index 14a8d2777bcfcb..3459a4a38b06c9 100644
--- a/tensorflow/core/common_runtime/executor_factory.h
+++ b/tensorflow/core/common_runtime/executor_factory.h
@@ -36,12 +36,13 @@ class ExecutorFactory {
                                  std::unique_ptr<Executor>* out_executor) = 0;
  virtual ~ExecutorFactory() {}

-  static void Register(const string& executor_type, ExecutorFactory* factory);
-  static absl::Status GetFactory(const string& executor_type,
+  static void Register(const std::string& executor_type,
+                       ExecutorFactory* factory);
+  static absl::Status GetFactory(const std::string& executor_type,
                                 ExecutorFactory** out_factory);
 };

-absl::Status NewExecutor(const string& executor_type,
+absl::Status NewExecutor(const std::string& executor_type,
                         const LocalExecutorParams& params, const Graph& graph,
                         std::unique_ptr<Executor>* out_executor);
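The registry being migrated above follows a common pattern: a lazily constructed global map guarded by a lock, with duplicate registrations rejected. A simplified standalone analogue, using std::mutex and bool returns in place of TF's mutex and absl::Status:

#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>

struct ExecutorFactory { /* ... */ };

std::mutex registry_mu;

// Lazily constructed and intentionally leaked, like executor_factories()
// above, so it stays usable during static initialization and shutdown.
std::unordered_map<std::string, ExecutorFactory*>& Registry() {
  static auto* factories =
      new std::unordered_map<std::string, ExecutorFactory*>;
  return *factories;
}

bool Register(const std::string& executor_type, ExecutorFactory* factory) {
  std::lock_guard<std::mutex> l(registry_mu);
  return Registry().insert({executor_type, factory}).second;  // false = dup
}

ExecutorFactory* GetFactory(const std::string& executor_type) {
  std::lock_guard<std::mutex> l(registry_mu);
  auto it = Registry().find(executor_type);
  return it == Registry().end() ? nullptr : it->second;
}

int main() {
  static ExecutorFactory default_factory;
  std::cout << Register("DEFAULT", &default_factory) << "\n";  // 1
  std::cout << Register("DEFAULT", &default_factory) << "\n";  // 0 (dup)
  std::cout << (GetFactory("DEFAULT") != nullptr) << "\n";     // 1
}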
diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index 9ca90f01f6c1c2..81719752519e56 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -128,7 +128,7 @@ Tensor V(const float val) {
 // An int32 val -> Tensor
 Tensor VI(const int32_t val) {
  Tensor tensor(DT_INT32, TensorShape({}));
-  tensor.scalar<int32>()() = val;
+  tensor.scalar<int32_t>()() = val;
  return tensor;
 }
@@ -153,10 +153,11 @@ float V(const Tensor& tensor) {
  return tensor.scalar<float>()();
 }

-static uint64 kIncarnation = 1;  // Uses in following tests.
+static uint64_t kIncarnation = 1;  // Used in following tests.

-Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
-                          const string& receiver, const string& name) {
+Rendezvous::ParsedKey Key(const std::string& sender, const uint64_t incarnation,
+                          const std::string& receiver,
+                          const std::string& name) {
  Rendezvous::ParsedKey result;
  CHECK(
      Rendezvous::ParseKey(Rendezvous::CreateKey(sender, incarnation, receiver,
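The VI()/V() helpers above rely on Tensor::scalar<T>(), whose template argument must match the tensor's dtype, which is why the hunk spells the element type as int32_t. A minimal usage sketch, assuming a TensorFlow C++ build for the headers below:

#include <cstdint>
#include <iostream>

#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"

int main() {
  // Rank-0 (scalar) int32 tensor, as in VI() above.
  tensorflow::Tensor t(tensorflow::DT_INT32, tensorflow::TensorShape({}));
  t.scalar<int32_t>()() = 42;  // scalar<T>() checks T against the dtype
  std::cout << t.scalar<int32_t>()() << "\n";  // 42
}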
@@ -508,8 +509,8 @@ static void BM_executor(::testing::benchmark::State& state) {
  Graph* g = new Graph(OpRegistry::Global());
  random::PhiloxRandom philox(1729, 17);
  random::SimplePhilox rand(&philox);
-  uint64 cur = 0;
-  uint32 r = 1 + rand.Rand32() % width;
+  uint64_t cur = 0;
+  uint32_t r = 1 + rand.Rand32() % width;
  std::vector<Node*> ready_nodes;
  for (int i = 0; i < r; ++i) {
    ready_nodes.push_back(test::graph::NoOp(g, {}));
@@ -589,9 +590,9 @@ static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) {
  Node* sum = test::graph::Add(g, x, y);
  Node* z = test::graph::Send(g, sum, "z", BOB, 1, ALICE);

-  string x_key = test::GetRendezvousKey(x);
-  string y_key = test::GetRendezvousKey(y);
-  string z_key = test::GetRendezvousKey(z);
+  std::string x_key = test::GetRendezvousKey(x);
+  std::string y_key = test::GetRendezvousKey(y);
+  std::string z_key = test::GetRendezvousKey(z);

  Tensor val(DT_FLOAT, TensorShape({}));
  val.scalar<float>()() = 3.14;
@@ -603,9 +604,10 @@ static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) {
 BENCHMARK(BM_FeedInputFetchOutput);

 absl::Status ReplaceEdgeWithSendRecv(Graph* g, const Edge* edge,
-                                     const string& tensor, const string& sender,
-                                     const uint64 sender_incarnation,
-                                     const string& receiver) {
+                                     const std::string& tensor,
+                                     const std::string& sender,
+                                     const uint64_t sender_incarnation,
+                                     const std::string& receiver) {
  Node* send;
  NodeDef send_def;
  TF_CHECK_OK(NodeDefBuilder(g->NewName("n"), "_Send")
@@ -662,16 +664,16 @@ static void BM_WhileLoopHelper(::testing::benchmark::State& state,
  FunctionDefLibrary f_lib_proto;

  // Define the loop body as a function: `x = x + 1`.
-  const Tensor one_t = test::AsScalar<int32>(1);
+  const Tensor one_t = test::AsScalar<int32_t>(1);

-  std::vector<string> args;
+  std::vector<std::string> args;
  args.reserve(loop_vars);
  args.push_back("x: int32");
  for (int i = 1; i < loop_vars; ++i) {
    args.push_back(absl::StrCat("x", i, ": int32"));
  }

-  std::vector<string> body_rets;
+  std::vector<std::string> body_rets;
  body_rets.reserve(loop_vars);
  body_rets.push_back("y: int32");
  for (int i = 1; i < loop_vars; ++i) {
@@ -703,7 +705,7 @@ static void BM_WhileLoopHelper(::testing::benchmark::State& state,
      body_nodes);

  // Define the loop condition as a function: `x < loop_iters`.
-  const Tensor loop_iters_t = test::AsScalar<int32>(loop_iters);
+  const Tensor loop_iters_t = test::AsScalar<int32_t>(loop_iters);
  *f_lib_proto.add_function() = FunctionDefHelper::Define(
      // Name
      "LessThanOrEqualToN",
@@ -775,7 +777,7 @@ static void BM_WhileLoopHelper(::testing::benchmark::State& state,
    if (edge->dst()->type_string() != "Switch") {
      continue;
    }
-    string tensor_name = absl::StrCat("c", edge->id());
+    std::string tensor_name = absl::StrCat("c", edge->id());
    TF_ASSERT_OK(ReplaceEdgeWithSendRecv(graph.get(), edge, tensor_name, BOB,
                                         1, ALICE));
  }
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index a57fe0323a3273..90080692323345 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -88,7 +88,7 @@ struct Endpoint {
  int index;

  // Returns the string name representing this endpoint.
-  string name() const {
+  std::string name() const {
    if (index == 0) {
      return node->name();
    } else {
@@ -100,7 +100,7 @@ struct Endpoint {
 };

 struct EndpointHash {
-  uint64 operator()(const Endpoint& x) const {
+  uint64_t operator()(const Endpoint& x) const {
    return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*),
                  x.index);
  }
@@ -166,7 +166,7 @@ class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime {
      : base_flr_(base_flr), lib_def_(std::move(lib_def)) {}
  ~FunctionLibraryRuntimeOverlay() override;

-  absl::Status Instantiate(const string& function_name, AttrSlice attrs,
+  absl::Status Instantiate(const std::string& function_name, AttrSlice attrs,
                           const InstantiateOptions& options,
                           Handle* handle) override;
@@ -192,7 +192,7 @@ class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime {
  absl::Status CreateKernel(const std::shared_ptr<const NodeProperties>& props,
                            OpKernel** kernel) override;

-  bool IsStateful(const string& function_name) const override;
+  bool IsStateful(const std::string& function_name) const override;

  const FunctionLibraryDefinition* GetFunctionLibraryDefinition()
      const override;
@@ -204,7 +204,7 @@ class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime {
  std::function<void(std::function<void()>)>* runner() override;
  const DeviceMgr* device_mgr() const override;

-  string DebugString(Handle handle) override;
+  std::string DebugString(Handle handle) override;
  int graph_def_version() const override;

  absl::Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
@@ -220,7 +220,7 @@ class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime {
 FunctionLibraryRuntimeOverlay::~FunctionLibraryRuntimeOverlay() = default;

 absl::Status FunctionLibraryRuntimeOverlay::Instantiate(
-    const string& function_name, AttrSlice attrs,
+    const std::string& function_name, AttrSlice attrs,
    const InstantiateOptions& options, Handle* handle) {
  // We automatically set the `lib_def` option for all instantiations, if the
  // caller doesn't set this option explicitly.
@@ -284,7 +284,7 @@ absl::Status FunctionLibraryRuntimeOverlay::CreateKernel(
 }

 bool FunctionLibraryRuntimeOverlay::IsStateful(
-    const string& function_name) const {
+    const std::string& function_name) const {
  // Important: we do not forward lookup to the base FLR.
const OpDef* op_def; const absl::Status s = lib_def_.LookUpOpDef(function_name, &op_def); @@ -317,7 +317,7 @@ FunctionLibraryRuntimeOverlay::GetFunctionLibraryDefinition() const { return &lib_def_; } -string FunctionLibraryRuntimeOverlay::DebugString(Handle handle) { +std::string FunctionLibraryRuntimeOverlay::DebugString(Handle handle) { return base_flr_->DebugString(handle); } @@ -348,7 +348,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { ~FunctionLibraryRuntimeImpl() override; - absl::Status Instantiate(const string& function_name, AttrSlice attrs, + absl::Status Instantiate(const std::string& function_name, AttrSlice attrs, const InstantiateOptions& options, Handle* handle) override; @@ -375,7 +375,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { absl::Status RunSync(Options opts, Handle handle, CallFrameInterface* call_frame) override; - bool IsStateful(const string& function) const override; + bool IsStateful(const std::string& function) const override; // TODO: b/396484774 - Consider handling the case where the FLR is already // finalized instead of always returning the pointer to the unowned library @@ -397,7 +397,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { const ConfigProto* const config_proto() override { return config_; } int graph_def_version() const override { return graph_def_version_; } - string DebugString(Handle h) override; + std::string DebugString(Handle h) override; absl::Status Clone(std::unique_ptr* out_lib_def, std::unique_ptr* out_pflr, @@ -416,9 +416,9 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { GraphOptimizer optimizer_; const SessionMetadata* const session_metadata_; Executor::Args::Runner default_runner_; - const string device_name_; + const std::string device_name_; - std::function get_func_sig_; + std::function get_func_sig_; std::function&, OpKernel**)> create_kernel_; @@ -432,13 +432,13 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { // The instantiated and transformed function is encoded as a Graph // object, and an executor is created for the graph. struct Item { - uint64 instantiation_counter = 0; + uint64_t instantiation_counter = 0; std::unique_ptr graph = nullptr; const FunctionLibraryDefinition* lib_def = nullptr; // Not owned. 
FunctionBody* func_graph = nullptr; Executor* exec = nullptr; core::RefCountPtr overlay_flr = nullptr; - string executor_type; + std::string executor_type; bool allow_small_function_optimizations = false; bool allow_control_flow_sync_execution = false; bool function_runs_at_most_once = false; @@ -517,7 +517,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( absl::flat_hash_map>>()), function_handle_cache_(std::make_unique(this)), parent_(parent) { - get_func_sig_ = [this](const string& op, const OpDef** sig) { + get_func_sig_ = [this](const std::string& op, const OpDef** sig) { return base_lib_def_->LookUpOpDef(op, sig); }; create_kernel_ = [this](const std::shared_ptr& props, @@ -714,7 +714,7 @@ absl::Status FunctionLibraryRuntimeImpl::FunctionDefToBody( return FunctionDefToBodyHelper(std::move(record), attrs, lib_def, get_func_sig_, fbody); } else { - auto get_func_sig = [lib_def](const string& op, const OpDef** sig) { + auto get_func_sig = [lib_def](const std::string& op, const OpDef** sig) { return lib_def->LookUpOpDef(op, sig); }; return FunctionDefToBodyHelper(std::move(record), attrs, lib_def, @@ -779,7 +779,7 @@ bool FunctionLibraryRuntimeImpl::IsLocalTarget( } absl::Status FunctionLibraryRuntimeImpl::Instantiate( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const InstantiateOptions& options, Handle* handle) { if (!IsLocalTarget(options)) { return parent_->Instantiate(function_name, attrs, options, handle); @@ -796,7 +796,7 @@ absl::Status FunctionLibraryRuntimeImpl::Instantiate( // in the canonical key. InstantiateOptions options_copy(options); options_copy.target = device_name_; - const string key = Canonicalize(function_name, attrs, options_copy); + const std::string key = Canonicalize(function_name, attrs, options_copy); { mutex_lock l(mu_); @@ -837,7 +837,7 @@ absl::Status FunctionLibraryRuntimeImpl::Instantiate( if (func.name() == kGradientOp) { return errors::InvalidArgument("Can't take gradient of SymbolicGradient"); } - const string grad = lib_def->FindGradient(func.name()); + const std::string grad = lib_def->FindGradient(func.name()); if (!grad.empty()) { return Instantiate(grad, AttrSlice(&func.attr()), options, handle); } @@ -941,7 +941,7 @@ absl::Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) { absl::Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) { const FunctionBody* fbody; FunctionLibraryRuntime* flr; - string executor_type; + std::string executor_type; { tf_shared_lock l(mu_); fbody = (*item)->func_graph; @@ -1120,8 +1120,8 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, absl::Span args, std::vector* rets, Item* item, DoneCallback done) { - string target_device = parent_->GetDeviceName(handle); - string source_device = opts.source_device; + std::string target_device = parent_->GetDeviceName(handle); + std::string source_device = opts.source_device; RendezvousInterface* rendezvous = opts.rendezvous; DeviceContext* device_context; absl::Status s = parent_->GetDeviceContext(target_device, &device_context); @@ -1436,13 +1436,13 @@ absl::Status FunctionLibraryRuntimeImpl::RunSync( return absl::OkStatus(); } -bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) const { +bool FunctionLibraryRuntimeImpl::IsStateful(const std::string& func) const { const OpDef* op_def; const absl::Status s = base_lib_def_->LookUpOpDef(func, &op_def); return s.ok() && op_def->is_stateful(); } -string FunctionLibraryRuntimeImpl::DebugString(Handle handle) 
{ +std::string FunctionLibraryRuntimeImpl::DebugString(Handle handle) { Item* item = nullptr; LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle); absl::Status s = GetOrCreateItem(local_handle, &item); diff --git a/tensorflow/core/common_runtime/function_def_utils.cc b/tensorflow/core/common_runtime/function_def_utils.cc index 17263a4465cf31..570791252dda4e 100644 --- a/tensorflow/core/common_runtime/function_def_utils.cc +++ b/tensorflow/core/common_runtime/function_def_utils.cc @@ -41,7 +41,7 @@ namespace tensorflow { absl::Status FunctionDefToBodyHelper( core::RefCountPtr&& record, const AttrSlice& attrs, const FunctionLibraryDefinition* const lib_def, - const std::function& + const std::function& get_func_sig, std::unique_ptr* fbody) { // Instantiates the function template into a graph def. @@ -96,7 +96,8 @@ absl::Status FunctionDefToBodyHelper(core::RefCountPtr&& record, const AttrSlice& attrs, const FunctionLibraryDefinition* lib_def, std::unique_ptr* fbody) { - const auto get_func_sig = [&lib_def](const string& op, const OpDef** sig) { + const auto get_func_sig = [&lib_def](const std::string& op, + const OpDef** sig) { return lib_def->LookUpOpDef(op, sig); }; return FunctionDefToBodyHelper(std::move(record), attrs, lib_def, @@ -109,7 +110,8 @@ absl::Status FunctionDefToBodyHelper(const FunctionDef& fdef, std::unique_ptr* fbody) { core::RefCountPtr record( new FunctionRecord(FunctionDef(fdef), {}, true)); - const auto get_func_sig = [&lib_def](const string& op, const OpDef** sig) { + const auto get_func_sig = [&lib_def](const std::string& op, + const OpDef** sig) { return lib_def->LookUpOpDef(op, sig); }; return FunctionDefToBodyHelper(std::move(record), attrs, lib_def, @@ -125,8 +127,8 @@ bool PrunableStatefulNode(const Node* n) { // and can produce different results on each invocation (due to variable // updates) but it does not itself modify the variable. // TODO(b/341721055): Consolidate this set with other side effect modeling. 
- static const absl::flat_hash_set* prunable_stateful_ops = - new absl::flat_hash_set{ + static const absl::flat_hash_set* prunable_stateful_ops = + new absl::flat_hash_set{ FunctionLibraryDefinition::kArgOp, "ResourceGather", "ResourceGatherNd", diff --git a/tensorflow/core/common_runtime/function_def_utils.h b/tensorflow/core/common_runtime/function_def_utils.h index cd3b021ec2f3c9..589dd9304edea9 100644 --- a/tensorflow/core/common_runtime/function_def_utils.h +++ b/tensorflow/core/common_runtime/function_def_utils.h @@ -55,7 +55,7 @@ absl::Status FunctionDefToBodyHelper(const FunctionDef& fdef, absl::Status FunctionDefToBodyHelper( core::RefCountPtr&& record, const AttrSlice& attrs, const FunctionLibraryDefinition* lib_def, - const std::function& + const std::function& get_func_sig, std::unique_ptr* fbody); diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc index 08898cc8052396..adf7ea36fdd99d 100644 --- a/tensorflow/core/common_runtime/function_test.cc +++ b/tensorflow/core/common_runtime/function_test.cc @@ -74,7 +74,7 @@ using ::tsl::testing::StatusIs; using FDH = ::tensorflow::FunctionDefHelper; using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource; -absl::Status GetOpSig(const string& op, const OpDef** sig) { +absl::Status GetOpSig(const std::string& op, const OpDef** sig) { return OpRegistry::Global()->LookUpOpDef(op, sig); } @@ -220,14 +220,14 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return absl::OkStatus(); } - absl::Status Instantiate(FunctionLibraryRuntime* flr, const string& name, + absl::Status Instantiate(FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, FunctionLibraryRuntime::Handle* handle) { return flr->Instantiate(name, attrs, handle); } absl::Status Instantiate( - FunctionLibraryRuntime* flr, const string& name, + FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle) { @@ -235,7 +235,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status InstantiateAndRun(FunctionLibraryRuntime* flr, - const string& name, + const std::string& name, test::function::Attrs attrs, const std::vector& args, std::vector rets) { @@ -245,7 +245,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status InstantiateAndRun( - FunctionLibraryRuntime* flr, const string& name, + FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, const std::vector& args, std::vector rets) { @@ -295,7 +295,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status InstantiateAndRunViaCallFrameInterface( - FunctionLibraryRuntime* flr, const string& name, + FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, const std::vector& args, std::vector rets) { FunctionLibraryRuntime::Handle handle; @@ -331,7 +331,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } std::unique_ptr GetFuncBody(FunctionLibraryRuntime* flr, - const string& name, + const std::string& name, test::function::Attrs attrs) { FunctionLibraryRuntime::Handle handle; absl::Status status = flr->Instantiate(name, attrs, &handle); @@ -347,7 +347,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } std::unique_ptr GetGradBody(FunctionLibraryRuntime* flr, - const string& func, + const 
std::string& func, test::function::Attrs attrs) { FunctionLibraryRuntime::Handle handle; absl::Status status = flr->Instantiate(func, attrs, &handle); @@ -646,9 +646,9 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) { // Attrs {}, // Nodes - {FDH::Const("shape", absl::Span({1})), - FDH::Const("minval", 0), - FDH::Const("maxval", 10), + {FDH::Const("shape", absl::Span({1})), + FDH::Const("minval", 0), + FDH::Const("maxval", 10), // A stateful node. {{"y"}, "RandomUniformInt", @@ -665,7 +665,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) { // Simple case: instantiating with no state_handle. for (int32_t expected : {6, 4}) { TF_CHECK_OK(Run(flr0_, handle, opts, {}, {&y})); - test::ExpectTensorEqual(y, test::AsTensor({expected})); + test::ExpectTensorEqual(y, test::AsTensor({expected})); } } @@ -678,7 +678,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) { EXPECT_EQ(handle, handle_non_isolated); for (int32_t expected : {0, 1}) { TF_CHECK_OK(Run(flr0_, handle_non_isolated, opts, {}, {&y})); - test::ExpectTensorEqual(y, test::AsTensor({expected})); + test::ExpectTensorEqual(y, test::AsTensor({expected})); } } @@ -693,7 +693,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) { EXPECT_NE(handle, handle_isolated); for (int32_t expected : {6, 4, 0, 1}) { TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y})); - test::ExpectTensorEqual(y, test::AsTensor({expected})); + test::ExpectTensorEqual(y, test::AsTensor({expected})); } } @@ -708,7 +708,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) { EXPECT_NE(handle, handle_isolated); for (int32_t expected : {6, 4, 0, 1}) { TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y})); - test::ExpectTensorEqual(y, test::AsTensor({expected})); + test::ExpectTensorEqual(y, test::AsTensor({expected})); } } @@ -725,7 +725,7 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) { EXPECT_NE(handle, handle_isolated); for (int32_t expected : {6, 4, 0, 1}) { TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y})); - test::ExpectTensorEqual(y, test::AsTensor({expected})); + test::ExpectTensorEqual(y, test::AsTensor({expected})); } TF_CHECK_OK(flr0_->ReleaseHandle(handle_isolated)); } @@ -1128,9 +1128,9 @@ TEST_F(FunctionLibraryRuntimeTest, std::unique_ptr g; ExpandInlineFunctionsOptions opts; - const string input_node = "Func/b/input/_0"; - const string output_node = "Func/b/output/_1"; - const string output_control_node = "Func/b/output_control_node/_2"; + const std::string input_node = "Func/b/input/_0"; + const std::string output_node = "Func/b/output/_1"; + const std::string output_control_node = "Func/b/output_control_node/_2"; // Use data outputs as output control source. opts.native_options.output_control_src = OutputControlSrc::kDataOutputs; @@ -1203,9 +1203,9 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsAndKeepCallerNode) { return absl::OkStatus(); }; - const string input_node = "Func/b/input/_0"; - const string output_node = "Func/b/output/_1"; - const string output_control_node = "Func/b/output_control_node/_2"; + const std::string input_node = "Func/b/input/_0"; + const std::string output_node = "Func/b/output/_1"; + const std::string output_control_node = "Func/b/output_control_node/_2"; // Construct expected graph after function inlining. 
auto expected_graph = [&](const NodeDef& caller) -> GraphDef { @@ -1266,9 +1266,9 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsAndPlaceInlinedNodes) { using test::function::NDef; using KeepCallerNode = InlineFunctionBodyOptions::KeepCallerNode; - const string arg_device = "/job:arg/replica:0/task:0/device:GPU"; - const string call_device = "/job:call/replica:0/task:1/device:GPU"; - const string body_device = "/job:body/replica:0/task:1/device:CPU"; + const std::string arg_device = "/job:arg/replica:0/task:0/device:GPU"; + const std::string call_device = "/job:call/replica:0/task:1/device:GPU"; + const std::string body_device = "/job:body/replica:0/task:1/device:CPU"; const FunctionDef func = FDH::Create( "AddFunc", {"i: float"}, {"o: float"}, {}, @@ -1291,12 +1291,13 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsAndPlaceInlinedNodes) { return absl::OkStatus(); }; - const string input_node = "Func/b/input/_0"; - const string output_node = "Func/b/output/_1"; - const string output_control_node = "Func/b/output_control_node/_2"; + const std::string input_node = "Func/b/input/_0"; + const std::string output_node = "Func/b/output/_1"; + const std::string output_control_node = "Func/b/output_control_node/_2"; // Construct expected graph after function inlining. - auto expected_graph = [&](const std::vector& placed) -> GraphDef { + auto expected_graph = + [&](const std::vector& placed) -> GraphDef { return test::function::GDef( { NDef("a", "_Arg", {}, {{"T", DT_FLOAT}, {"index", 0}}, placed[0]), @@ -1364,7 +1365,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsAndPlaceInlinedNodes) { auto g = std::make_unique(OpRegistry::Global()); TF_ASSERT_OK(construct_graph(&g)); - const string merged_device = "/job:body/replica:0/task:1/device:CPU:*"; + const std::string merged_device = "/job:body/replica:0/task:1/device:CPU:*"; ExpandInlineFunctions(flr0_, g.get(), opts); GraphDef expected = expected_graph({/*a*/ arg_device, // @@ -1400,7 +1401,7 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) { {{"x1"}, "Add", {"o", "o"}, {{"T", T}}}, {{"x2"}, "Mul", {"a", "x1"}, {{"T", T}}}, {{"x3"}, "Mul", {"x1", "x2"}, {{"T", T}}}, - FDH::Const("shape", {1, 2}), + FDH::Const("shape", {1, 2}), // A stateful node. {{"keep_me"}, "RandomUniform", @@ -1410,7 +1411,7 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) { {{"z"}, "Add", {"a", "o"}, {{"T", T}}}}); Init({stateful_func}); - auto x = test::AsTensor({1, 2, 3, 4}); + auto x = test::AsTensor({1, 2, 3, 4}); auto y = test::AsTensor({1.0, 2.0, 3.0, 4.0}); Tensor z; @@ -1427,15 +1428,15 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) { TF_CHECK_OK(InstantiateAndRun(flr0_, "SquareAndAddOneWithStatefulNodes", {}, {x, y}, {&z})); - test::ExpectTensorEqual(z, test::AsTensor({2, 5, 10, 17})); + test::ExpectTensorEqual(z, test::AsTensor({2, 5, 10, 17})); stats_collector.FinalizeAndSwap(&stats); // Note that we do not expect the nodes named "y", "x1", "x2", or "x3" to // execute. 
- std::set expected_node_names( + std::set expected_node_names( {"_SOURCE", "shape", "x", "o", "a", "keep_me", "z", "z_RetVal"}); - std::set executed_node_names; + std::set executed_node_names; for (const auto& node_stats : stats.dev_stats()[0].node_stats()) { executed_node_names.insert(node_stats.node_name()); } @@ -1475,9 +1476,9 @@ TEST_F(FunctionLibraryRuntimeTest, DoNotPruneControlOutputsFromBody) { stats_collector.FinalizeAndSwap(&stats); - std::set expected_node_names( + std::set expected_node_names( {"_SOURCE", "i", "add", "ret", "o_RetVal"}); - std::set executed_node_names; + std::set executed_node_names; for (const auto& node_stats : stats.dev_stats()[0].node_stats()) { executed_node_names.insert(node_stats.node_name()); } @@ -1645,7 +1646,7 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiationError) { TEST_F(FunctionLibraryRuntimeTest, Error_BadControlFlow) { Init({test::function::InvalidControlFlow()}); - auto x = test::AsTensor({0}); + auto x = test::AsTensor({0}); DCHECK_EQ(x.dtype(), DT_INT32); Tensor y; HasError(InstantiateAndRun(flr0_, "InvalidControlFlow", {}, {x}, {&y}), @@ -2117,7 +2118,7 @@ TEST_F(FunctionLibraryRuntimeTest, FullTypeForInt32) { {{"z"}, "Add", {"x", "x"}, {{"T", T}}}}); Init({int32_func}); - auto x = test::AsTensor({1, 2, 3, 4}); + auto x = test::AsTensor({1, 2, 3, 4}); auto y = test::AsTensor({1.0, 2.0, 3.0, 4.0}); Tensor z; diff --git a/tensorflow/core/common_runtime/function_testlib.cc b/tensorflow/core/common_runtime/function_testlib.cc index 77ee26e29d0e1f..a37f05da7df38e 100644 --- a/tensorflow/core/common_runtime/function_testlib.cc +++ b/tensorflow/core/common_runtime/function_testlib.cc @@ -126,8 +126,8 @@ FunctionDef BlockingOpFn() { } // TODO(phawkins): replace with C++ API for calling functions, when that exists. -Output Call(Scope* scope, const string& op_name, const string& fn_name, - absl::Span inputs) { +Output Call(Scope* scope, const std::string& op_name, + const std::string& fn_name, absl::Span inputs) { NodeDef def; NodeDefBuilder builder(op_name, fn_name, scope->graph()->op_registry()); for (const Input& input : inputs) { diff --git a/tensorflow/core/common_runtime/function_testlib.h b/tensorflow/core/common_runtime/function_testlib.h index 9618c4083b869e..b71acef0c83408 100644 --- a/tensorflow/core/common_runtime/function_testlib.h +++ b/tensorflow/core/common_runtime/function_testlib.h @@ -44,8 +44,8 @@ FunctionDef BlockingOpFn(); // Adds a function call to the given scope and returns the output for the node. // TODO(phawkins): replace with C++ API for calling functions, when that exists. 
-Output Call(Scope* scope, const string& op_name, const string& fn_name, - absl::Span inputs); +Output Call(Scope* scope, const std::string& op_name, + const std::string& fn_name, absl::Span inputs); } // namespace function } // namespace test diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc index e28eb03fd8787b..4c6846593885f5 100644 --- a/tensorflow/core/common_runtime/function_threadpool_test.cc +++ b/tensorflow/core/common_runtime/function_threadpool_test.cc @@ -81,7 +81,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { FunctionLibraryRuntime::Options opts, const std::vector& args, std::vector rets, bool add_runner = true) { - std::atomic call_count(0); + std::atomic call_count(0); std::function)> runner = [&call_count](std::function fn) { ++call_count; @@ -115,14 +115,14 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return absl::OkStatus(); } - absl::Status Instantiate(FunctionLibraryRuntime* flr, const string& name, + absl::Status Instantiate(FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, FunctionLibraryRuntime::Handle* handle) { return flr->Instantiate(name, attrs, handle); } absl::Status Instantiate( - FunctionLibraryRuntime* flr, const string& name, + FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle) { @@ -130,7 +130,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status InstantiateAndRun(FunctionLibraryRuntime* flr, - const string& name, + const std::string& name, test::function::Attrs attrs, const std::vector& args, std::vector rets, @@ -141,7 +141,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status InstantiateAndRun( - FunctionLibraryRuntime* flr, const string& name, + FunctionLibraryRuntime* flr, const std::string& name, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& options, const std::vector& args, std::vector rets, @@ -171,7 +171,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime::Options opts, CallFrameInterface* frame, bool add_runner = true) { - std::atomic call_count(0); + std::atomic call_count(0); std::function)> runner = [&call_count](std::function fn) { ++call_count; @@ -232,7 +232,7 @@ TEST_F(FunctionLibraryRuntimeTest, DefaultThreadpool) { TF_CHECK_OK(Instantiate(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, &h)); auto x1 = test::AsTensor({1, 2, 3, 4}); - std::atomic num_done(0); + std::atomic num_done(0); FunctionLibraryRuntime::Options opts; for (int i = 0; i < 4; ++i) { tp1->Schedule([&h, &x1, &opts, &num_done, this]() { diff --git a/tensorflow/core/common_runtime/function_utils.cc b/tensorflow/core/common_runtime/function_utils.cc index 5c743abd0e81df..736dcc4db4811b 100644 --- a/tensorflow/core/common_runtime/function_utils.cc +++ b/tensorflow/core/common_runtime/function_utils.cc @@ -36,7 +36,7 @@ struct Endpoint { int index; // Returns the string name represents this endpoint. 
-  string name() const {
+  std::string name() const {
    if (index == 0) {
      return node->name();
    } else {
@@ -285,7 +285,7 @@ bool IsFunctionCall(const FunctionLibraryDefinition& lib_def,
  return node.IsFunctionCall();
 }

-string NewName(const Node* n, bool pretty) {
+std::string NewName(const Node* n, bool pretty) {
  if (pretty) {
    return absl::StrCat(n->type_string(), n->id());
  } else {
@@ -347,7 +347,7 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
      ndef->add_input("unknown");
      continue;
    }
-    const string srcname = NewName(e->src(), pretty);
+    const std::string srcname = NewName(e->src(), pretty);
    if (!e->src()->IsOp()) {
    } else if (e->IsControlEdge()) {
      ndef->add_input(absl::StrCat("^", srcname));
@@ -360,7 +360,7 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
  });
 }

-string DebugString(const Graph* g) {
+std::string DebugString(const Graph* g) {
  GraphDef gdef;
  ToGraphDef(g, &gdef);
  return DebugString(gdef);
diff --git a/tensorflow/core/common_runtime/function_utils.h b/tensorflow/core/common_runtime/function_utils.h
index cfbfe86936421b..97cd4cc63e8ea4 100644
--- a/tensorflow/core/common_runtime/function_utils.h
+++ b/tensorflow/core/common_runtime/function_utils.h
@@ -34,7 +34,7 @@ class OpDef;

 // Debugging facility. Returns a debug string for a graph
 // representing an instantiated function.
-string DebugString(const Graph* g);
+std::string DebugString(const Graph* g);

 // Dump the contents of the "graph" to log files if the logging level is
 // sufficiently high.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 049155675fcdb6..4d192d8af9fab4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -34,8 +34,8 @@ limitations under the License.
 #ifdef TF_GPU_USE_PJRT
 #include "tensorflow/compiler/jit/pjrt_tensor_buffer.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "xla/future.h"
 #include "xla/literal.h"
-#include "xla/pjrt/pjrt_future.h"
 #endif  // TF_GPU_USE_PJRT

 #include "tensorflow/core/common_runtime/copy_tensor.h"
diff --git a/tensorflow/core/common_runtime/graph_view.cc b/tensorflow/core/common_runtime/graph_view.cc
index 072c3353c5b8d9..f84dbfac0d3f6d 100644
--- a/tensorflow/core/common_runtime/graph_view.cc
+++ b/tensorflow/core/common_runtime/graph_view.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <string>
 #include <vector>

+#include "absl/log/check.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -69,7 +70,7 @@ namespace {
 typedef std::tuple<int32, int32> OutputAndControlEdges;

 OutputAndControlEdges CountOutputEdges(const Node* n) {
-  DCHECK_LE(n->out_edges().size(), kint32max);
+  DCHECK_LE(n->out_edges().size(), std::numeric_limits<int32_t>::max());
  int32_t num_output_edges = 0;
  int32_t num_output_control_edges = 0;
  for (auto e : n->out_edges()) {
@@ -125,7 +126,8 @@ size_t GraphView::NodeItemBytes(const Node* n) {

 char* GraphView::InitializeNode(char* ptr, const Node* n) {
  const int id = n->id();
-  CHECK(node_offsets_[id] == kuint32max);  // Initial value in constructor
+  CHECK(node_offsets_[id] ==
+        std::numeric_limits<uint32>::max());  // Initial value in constructor
  const size_t bytes = NodeItemBytes(n);
  constexpr size_t kItemAlignment = sizeof(NodeItem*);
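The kuint32max replacements in this file all serve the sentinel pattern visible above: a node offset equal to numeric_limits<uint32>::max() means "not initialized yet", and the CHECKs assert each slot is written exactly once. A standalone sketch of that sentinel pattern with the <limits> spelling (uint32_t here; TF's uint32 alias has the same width):

#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

constexpr uint32_t kUninitialized = std::numeric_limits<uint32_t>::max();

int main() {
  // All offsets start at the sentinel, as in GraphView's constructor.
  std::vector<uint32_t> node_offsets(4, kUninitialized);
  node_offsets[2] = 128;  // initialize one node's offset
  for (uint32_t offset : node_offsets) {
    if (offset == kUninitialized)
      std::cout << "uninitialized\n";
    else
      std::cout << offset << "\n";
  }
}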
@@ -137,7 +139,8 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
  // (versus 64 bits on most machines if we just stored an array of NodeItem*
  // pointers). Casting to int64 is needed on 32bit CPU to avoid comparing
  // values as "int" vs "size_t" in CHECK_LE.
-  CHECK_LE(static_cast<int64_t>(ptr - space_), kuint32max);
+  CHECK_LE(static_cast<int64_t>(ptr - space_),
+           std::numeric_limits<uint32>::max());
  const uint32 offset = static_cast<uint32>(ptr - space_);
  node_offsets_[id] = offset;
  ptr += bytes;
@@ -252,7 +255,7 @@ absl::Status GraphView::Initialize(const Graph* g) {
  num_nodes_ = num_nodes;
  size_t total_bytes = 0;
  for (const Node* n : g->nodes()) {
-    if (n->out_edges().size() > kint32max) {
+    if (n->out_edges().size() > std::numeric_limits<int32_t>::max()) {
      return errors::InvalidArgument(
          "The executor cannot handle nodes with more than ",
          std::numeric_limits<int32_t>::max(), " output edges. Node ",
@@ -263,7 +266,7 @@ absl::Status GraphView::Initialize(const Graph* g) {

  node_offsets_ = new uint32[num_nodes];
  for (int i = 0; i < num_nodes; i++) {
-    node_offsets_[i] = kuint32max;
+    node_offsets_[i] = std::numeric_limits<uint32>::max();
  }

  space_ = new char[total_bytes];  // NodeItem objects are allocated here
diff --git a/tensorflow/core/common_runtime/graph_view.h b/tensorflow/core/common_runtime/graph_view.h
index 83d15e71282024..3864df8a6ce165 100644
--- a/tensorflow/core/common_runtime/graph_view.h
+++ b/tensorflow/core/common_runtime/graph_view.h
@@ -221,7 +221,7 @@ class GraphView {
    DCHECK_GE(id, 0);
    DCHECK_LT(id, num_nodes_);
    uint32 offset = node_offsets_[id];
-    return ((offset == kuint32max)
+    return ((offset == std::numeric_limits<uint32>::max())
                ? nullptr
                : reinterpret_cast<NodeItem*>(space_ + node_offsets_[id]));
  }
@@ -233,7 +233,7 @@ class GraphView {
    DCHECK_GE(id, 0);
    DCHECK_LT(id, num_nodes_);
    uint32 offset = node_offsets_[id];
-    DCHECK_NE(offset, kuint32max);
+    DCHECK_NE(offset, std::numeric_limits<uint32>::max());
    return *reinterpret_cast<NodeItem*>(space_ + node_offsets_[id]);
  }
diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device.cc b/tensorflow/core/common_runtime/pluggable_device/pluggable_device.cc
index 998c81efc85d97..a9ebb6f4c3559d 100644
--- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device.cc
+++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device.cc
@@ -242,7 +242,7 @@ absl::Status PluggableDevice::Init(const SessionOptions& options) {
  // callback instead of GPU environment variables: TF_GPU_THREAD_MODE,
  // TF_GPU_THREAD_COUNT, TF_FORCE_GPU_ALLOC_GROWTH,
  // TF_ENABLE_GPU_GARBAGE_COLLECTION, and TF_GPU_HOST_MEM_LIMIT_IN_MB.
- string device_thread_mode; + std::string device_thread_mode; TF_RETURN_IF_ERROR(ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &device_thread_mode)); device_thread_mode = absl::AsciiStrToLower(device_thread_mode); @@ -256,19 +256,19 @@ absl::Status PluggableDevice::Init(const SessionOptions& options) { thread_pool_ = std::make_unique( options.env, ThreadOptions(), absl::StrCat("gpu_private_", tf_device_id_.value()), - static_cast(device_thread_count), + static_cast(device_thread_count), !options.config.experimental().disable_thread_spinning(), /*allocator=*/nullptr); set_tensorflow_device_thread_pool(thread_pool_.get()); } else if (device_thread_mode == "gpu_shared") { static thread::ThreadPool* thread_pool = new thread::ThreadPool( options.env, ThreadOptions(), "gpu_shared", - static_cast(device_thread_count), + static_cast(device_thread_count), !options.config.experimental().disable_thread_spinning(), /*allocator=*/nullptr); set_tensorflow_device_thread_pool(thread_pool); } else { - string error_message = + std::string error_message = absl::StrCat("Invalid gpu_thread_mode: ", device_thread_mode); LOG(WARNING) << error_message; return errors::InvalidArgument(error_message); @@ -293,8 +293,8 @@ Allocator* PluggableDevice::GetAllocator(AllocatorAttributes attr) { } } -string PluggableDevice::ComputeOpKernelDebugString(const OpKernel& op_kernel, - const int stream_id) { +std::string PluggableDevice::ComputeOpKernelDebugString( + const OpKernel& op_kernel, const int stream_id) { return strings::StrCat(op_kernel.name(), " op ", op_kernel.type_string(), " on ", platform_name_, tf_device_id_.value(), " stream[", stream_id, "]"); diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h b/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h index bfcbc16d0eb2da..9ccdc04192e071 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device.h @@ -48,9 +48,9 @@ namespace tensorflow { class PluggableDevice : public LocalDevice { public: PluggableDevice(const SessionOptions& options, const std::string& name, - const string& device_type, const string& platform_name, - Bytes memory_limit, const DeviceLocality& locality, - TfDeviceId tf_device_id, + const std::string& device_type, + const std::string& platform_name, Bytes memory_limit, + const DeviceLocality& locality, TfDeviceId tf_device_id, const std::string& physical_device_desc, Allocator* device_allocator, Allocator* cpu_allocator, bool sync_every_op); @@ -99,7 +99,7 @@ class PluggableDevice : public LocalDevice { // TODO(penpornk): Investigate renaming `GpuDeviceInfo` to `DeviceInfo`. 
DeviceBase::AcceleratorDeviceInfo* pluggable_device_info_ = nullptr; TfDeviceId tf_device_id_; - const string platform_name_; + const std::string platform_name_; const bool sync_every_op_ = false; EventMgr* em_ = nullptr; std::unique_ptr thread_pool_; diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc index e4b3ef4c8e7f2b..ac2488d0b57664 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc @@ -94,14 +94,14 @@ bool PluggableDeviceBFCAllocator::GetGarbageCollectionValue() { } PluggableDeviceBFCAllocator::PluggableDeviceBFCAllocator( - tsl::SubAllocator* sub_allocator, size_t total_memory, const string& name, - bool force_memory_growth_requested) + tsl::SubAllocator* sub_allocator, size_t total_memory, + const std::string& name, bool force_memory_growth_requested) : PluggableDeviceBFCAllocator(sub_allocator, total_memory, GPUOptions(), name, force_memory_growth_requested) {} PluggableDeviceBFCAllocator::PluggableDeviceBFCAllocator( tsl::SubAllocator* sub_allocator, size_t total_memory, - const GPUOptions& gpu_options, const string& name, + const GPUOptions& gpu_options, const std::string& name, bool force_memory_growth_requested) : BFCAllocator(absl::WrapUnique(sub_allocator), total_memory, name, [&] { BFCAllocator::Options o; diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h index b968b9dbc1c734..9e87b2612343a6 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.h @@ -30,11 +30,12 @@ namespace tensorflow { class PluggableDeviceBFCAllocator : public BFCAllocator { public: PluggableDeviceBFCAllocator(tsl::SubAllocator* sub_allocator, - size_t total_memory, const string& name, + size_t total_memory, const std::string& name, bool force_memory_growth_requested); PluggableDeviceBFCAllocator(tsl::SubAllocator* sub_allocator, size_t total_memory, - const GPUOptions& gpu_options, const string& name, + const GPUOptions& gpu_options, + const std::string& name, bool force_memory_growth_requested); ~PluggableDeviceBFCAllocator() override = default; diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc index d580a185f6ed56..855e796ee7903d 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc @@ -82,7 +82,7 @@ int64_t MinSystemMemory(int64_t available_memory) { // Get the memory limit for the virtual device being created on PluggableDevice // with 'platform_device_id', when that virtual device is the only // virtual device being created on that PluggableDevice. 
-absl::Status SingleVirtualDeviceMemoryLimit(const string& platform_name, +absl::Status SingleVirtualDeviceMemoryLimit(const std::string& platform_name, const GPUOptions& device_options, PlatformDeviceId platform_device_id, int64_t* memory_limit) { @@ -119,18 +119,18 @@ absl::Status SingleVirtualDeviceMemoryLimit(const string& platform_name, } } // namespace -PluggableDeviceFactory::PluggableDeviceFactory(const string& device_type, - const string& platform_name) +PluggableDeviceFactory::PluggableDeviceFactory(const std::string& device_type, + const std::string& platform_name) : device_type_(device_type), platform_name_(platform_name) {} absl::Status PluggableDeviceFactory::ListPhysicalDevices( - std::vector* devices) { + std::vector* devices) { TF_RETURN_IF_ERROR(ValidatePluggableDeviceMachineManager(platform_name_)); se::Platform* platform = PluggableDeviceMachineManager(platform_name_); int device_count = platform->VisibleDeviceCount(); for (int i = 0; i < device_count; ++i) { - const string device_name = + const std::string device_name = absl::StrCat("/physical_device:", device_type_, ":", i); devices->push_back(device_name); } @@ -139,7 +139,7 @@ absl::Status PluggableDeviceFactory::ListPhysicalDevices( } absl::Status PluggableDeviceFactory::GetDeviceDetails( - int device_index, std::unordered_map* details) { + int device_index, std::unordered_map* details) { TF_RETURN_IF_ERROR(ValidatePluggableDeviceMachineManager(platform_name_)); se::Platform* platform = PluggableDeviceMachineManager(platform_name_); if (platform == nullptr) { @@ -163,7 +163,7 @@ absl::Status PluggableDeviceFactory::GetDeviceDetails( } absl::Status PluggableDeviceFactory::CreateDevices( - const SessionOptions& options, const string& name_prefix, + const SessionOptions& options, const std::string& name_prefix, std::vector>* devices) { TF_RETURN_IF_ERROR(ValidatePluggableDeviceMachineManager(platform_name_)); se::Platform* platform = PluggableDeviceMachineManager(platform_name_); @@ -214,20 +214,20 @@ absl::Status PluggableDeviceFactory::CreateDevices( return absl::OkStatus(); } -static string GetShortDeviceDescription(PlatformDeviceId platform_device_id, - const se::DeviceDescription& desc) { +static std::string GetShortDeviceDescription( + PlatformDeviceId platform_device_id, const se::DeviceDescription& desc) { return strings::StrCat("device: ", platform_device_id.value(), ", name: ", desc.name(), ", pci bus id: ", desc.pci_bus_id()); } absl::Status PluggableDeviceFactory::CreatePluggableDevice( - const SessionOptions& options, const string& name_prefix, + const SessionOptions& options, const std::string& name_prefix, TfDeviceId tf_device_id, int64_t memory_limit, const DeviceLocality& dev_locality, std::vector>* devices) { DCHECK_GE(tf_device_id.value(), 0); - const string device_name = strings::StrCat( + const std::string device_name = strings::StrCat( name_prefix, "/device:", device_type_, ":", tf_device_id.value()); se::Platform* platform = PluggableDeviceMachineManager(platform_name_); diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h index 3f6ab10f9951fc..92a145080a0ba4 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.h @@ -34,14 +34,15 @@ limitations under the License. 
namespace tensorflow { class PluggableDeviceFactory : public DeviceFactory { public: - PluggableDeviceFactory(const string& device_type, - const string& platform_name); - absl::Status ListPhysicalDevices(std::vector<string>* devices) override; + PluggableDeviceFactory(const std::string& device_type, + const std::string& platform_name); + absl::Status ListPhysicalDevices(std::vector<std::string>* devices) override; absl::Status CreateDevices( const SessionOptions& options, const std::string& name_prefix, std::vector<std::unique_ptr<Device>>* devices) override; absl::Status GetDeviceDetails( - int device_index, std::unordered_map<string, string>* details) override; + int device_index, + std::unordered_map<std::string, std::string>* details) override; private: // Populates *device_localities with the DeviceLocality descriptor for @@ -57,8 +58,8 @@ class PluggableDeviceFactory : public DeviceFactory { const DeviceLocality& dev_locality, std::vector<std::unique_ptr<Device>>* devices); - const string device_type_; - const string platform_name_; + const std::string device_type_; + const std::string platform_name_; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.cc b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.cc index 52c09016bddcd1..696248aba12122 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.cc +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.cc @@ -25,11 +25,11 @@ limitations under the License. namespace tensorflow { absl::Status ValidatePluggableDeviceMachineManager( - const string& platform_name) { + const std::string& platform_name) { return se::PlatformManager::PlatformWithName(platform_name).status(); } -se::Platform* PluggableDeviceMachineManager(const string& platform_name) { +se::Platform* PluggableDeviceMachineManager(const std::string& platform_name) { auto result = se::PlatformManager::PlatformWithName(platform_name); if (!result.ok()) { LOG(FATAL) << "Could not find platform with name " // Crash OK diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h index b77917d14701c5..6d385ac31c435d 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_init.h @@ -30,7 +30,8 @@ namespace tensorflow { // Initializes the PluggableDevice platform and returns OK if the // PluggableDevice platform could be initialized. -absl::Status ValidatePluggableDeviceMachineManager(const string& platform_name); +absl::Status ValidatePluggableDeviceMachineManager( + const std::string& platform_name); // Returns the PluggableDevice machine manager singleton, creating it and // initializing the PluggableDevices on the machine if needed the first time it @@ -38,7 +39,7 @@ absl::Status ValidatePluggableDeviceMachineManager(const string& platform_name); // environment in the process (e.g., ValidatePluggableDeviceMachineManager() // returns OK).
stream_executor::Platform* PluggableDeviceMachineManager( - const string& platform_name); + const std::string& platform_name); } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.cc b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.cc index 5e41c8db0c39b6..d348c678a15ea3 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.cc +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_plugin_init.cc @@ -49,7 +49,7 @@ static absl::Status InitDeviceModule(stream_executor::SEInitPluginFn init_fn) { return absl::OkStatus(); } - string device_type, platform_name; + std::string device_type, platform_name; TF_RETURN_IF_ERROR(stream_executor::InitStreamExecutorPlugin( init_fn, &device_type, &platform_name)); diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.cc b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.cc index 581f6b6c5c306f..01f6aa0e97bb00 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.cc +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.cc @@ -57,9 +57,9 @@ limitations under the License. namespace tensorflow { /*static*/ PluggableDeviceProcessState* PluggableDeviceProcessState::singleton( - const string& device_type, const string& platform_name) { + const std::string& device_type, const std::string& platform_name) { using ProcessStateMap = - std::unordered_map<string, PluggableDeviceProcessState*>; + std::unordered_map<std::string, PluggableDeviceProcessState*>; static ProcessStateMap* process_state_map = new ProcessStateMap; auto iter = process_state_map->find(platform_name); if (iter != process_state_map->end()) { @@ -71,7 +71,7 @@ namespace tensorflow { } PluggableDeviceProcessState::PluggableDeviceProcessState( - const string& device_type, const string& platform_name) + const std::string& device_type, const std::string& platform_name) : pluggable_device_enabled_(false), device_type_(device_type), platform_name_(platform_name) { @@ -93,7 +93,7 @@ int PluggableDeviceProcessState::BusIdForPluggableDevice( Allocator* PluggableDeviceProcessState::GetPluggableDeviceAllocator( const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes) { DCHECK(process_state_); - const string& allocator_type = options.allocator_type(); + const std::string& allocator_type = options.allocator_type(); se::Platform* platform = PluggableDeviceMachineManager(platform_name_); mutex_lock lock(mu_); tsl::CheckValidTfDeviceId(DeviceType(device_type_), diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h index 6e6b45fe887dca..6afb0daa77a2da 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_process_state.h @@ -43,8 +43,8 @@ class PluggableDeviceProcessState { public: // Singleton that manages each platform's per-process state. e.g. allocation // of shared resource. - static PluggableDeviceProcessState* singleton(const string& device_type, - const string& platform_name); + static PluggableDeviceProcessState* singleton( + const std::string& device_type, const std::string& platform_name); // Query whether any PluggableDevice has been created so far. // Disable thread safety analysis since a race is benign here.
@@ -89,8 +89,8 @@ class PluggableDeviceProcessState { protected: // PluggableDeviceProcessState is a singleton that should not normally be // deleted except at process shutdown. - PluggableDeviceProcessState(const string& device_type, - const string& platform_name); + PluggableDeviceProcessState(const std::string& device_type, + const std::string& platform_name); virtual ~PluggableDeviceProcessState() = default; ProcessState::MDMap* mem_desc_map() { @@ -101,8 +101,8 @@ class PluggableDeviceProcessState { static PluggableDeviceProcessState* instance_; ProcessState* process_state_; // Not owned. bool pluggable_device_enabled_; - const string device_type_; - const string platform_name_; + const std::string device_type_; + const std::string platform_name_; mutex mu_; struct AllocatorParts { diff --git a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h index 27304954c25b0c..b7e9424982b22a 100644 --- a/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h +++ b/tensorflow/core/common_runtime/pluggable_device/pluggable_device_simple_allocator.h @@ -35,7 +35,7 @@ class PluggableDeviceSimpleAllocator : public Allocator { void DeallocateRaw(void* ptr) override; bool TracksAllocationSizes() const override { return false; } - string Name() override { return "Simple allocator"; } + std::string Name() override { return "Simple allocator"; } std::optional<AllocatorStats> GetStats() override; AllocatorMemoryType GetMemoryType() const override { diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc index 29ae03e0d1f996..e74d99fd2af2ad 100644 --- a/tensorflow/core/common_runtime/pool_allocator.cc +++ b/tensorflow/core/common_runtime/pool_allocator.cc @@ -37,7 +37,7 @@ namespace tensorflow { PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize, SubAllocator* allocator, - RoundUpInterface* size_rounder, string name) + RoundUpInterface* size_rounder, std::string name) : name_(std::move(name)), has_size_limit_(pool_size_limit > 0), auto_resize_(auto_resize), diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h index 6ce3b7886cfa6b..69c1e7a75b88d9 100644 --- a/tensorflow/core/common_runtime/pool_allocator.h +++ b/tensorflow/core/common_runtime/pool_allocator.h @@ -55,10 +55,10 @@ class PoolAllocator : public Allocator { // malloc/free operations. This object takes ownership of allocator. PoolAllocator(size_t pool_size_limit, bool auto_resize, SubAllocator* allocator, RoundUpInterface* size_rounder, - string name); + std::string name); ~PoolAllocator() override; - string Name() override { return name_; } + std::string Name() override { return name_; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; @@ -121,7 +121,7 @@ class PoolAllocator : public Allocator { // Delete the least recently used record.
void EvictOne() TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - const string name_; + const std::string name_; const bool has_size_limit_; const bool auto_resize_; size_t pool_size_limit_; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 98af5aedeedee1..c26495dfa83117 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -92,7 +92,7 @@ int64_t GetParallelSubgraphThreshold() { const char ProcessFunctionLibraryRuntime::kDefaultFLRDevice[] = "null"; void ProcessFunctionLibraryRuntime::FunctionData::DistributedInit( - DistributedFunctionLibraryRuntime* parent, const string& function_name, + DistributedFunctionLibraryRuntime* parent, const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::DoneCallback done) { @@ -149,16 +149,17 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime( /* static */ absl::Status ProcessFunctionLibraryRuntime::SendTensors( - const string& source_device, const string& target_device, - const string& key_prefix, int64_t src_incarnation, + const std::string& source_device, const std::string& target_device, + const std::string& key_prefix, int64_t src_incarnation, absl::Span<const Tensor> tensors_to_send, DeviceContext* device_context, const std::vector<AllocatorAttributes>& alloc_attrs, RendezvousInterface* rendezvous) { - std::vector<string> keys; + std::vector<std::string> keys; for (int i = 0; i < tensors_to_send.size(); ++i) { - string name = strings::StrCat(key_prefix, i); - string key = Rendezvous::CreateKey(source_device, src_incarnation, - target_device, name, FrameAndIter(0, 0)); + std::string name = absl::StrCat(key_prefix, i); + std::string key = + Rendezvous::CreateKey(source_device, src_incarnation, target_device, + name, FrameAndIter(0, 0)); keys.push_back(key); } TF_RETURN_IF_ERROR(SendTensorsToRendezvous( @@ -168,17 +169,18 @@ absl::Status ProcessFunctionLibraryRuntime::SendTensors( /* static */ void ProcessFunctionLibraryRuntime::ReceiveTensorsAsync( - const string& source_device, const string& target_device, - const string& key_prefix, int64_t src_incarnation, int64_t num_tensors, + const std::string& source_device, const std::string& target_device, + const std::string& key_prefix, int64_t src_incarnation, int64_t num_tensors, DeviceContext* device_context, const std::vector<AllocatorAttributes>& alloc_attrs, RendezvousInterface* rendezvous, std::vector<Tensor>* received_tensors, StatusCallback done) { - std::vector<string> keys; + std::vector<std::string> keys; for (int64_t i = 0; i < num_tensors; ++i) { - string name = strings::StrCat(key_prefix, i); - string key = Rendezvous::CreateKey(source_device, src_incarnation, - target_device, name, FrameAndIter(0, 0)); + std::string name = absl::StrCat(key_prefix, i); + std::string key = + Rendezvous::CreateKey(source_device, src_incarnation, target_device, + name, FrameAndIter(0, 0)); keys.push_back(key); } RecvOutputsFromRendezvousAsync(rendezvous, device_context, alloc_attrs, keys, @@ -207,7 +209,7 @@ absl::Status ProcessFunctionLibraryRuntime::GetRetTypes( } absl::Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation( - const string& device_name, int64_t* incarnation) const { + const std::string& device_name, int64_t* incarnation) const { FunctionLibraryRuntime* flr = GetFLR(device_name); if (flr == nullptr) { return errors::InvalidArgument("Device name: ", device_name, " not
found."); @@ -217,14 +219,14 @@ absl::Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation( } absl::Status ProcessFunctionLibraryRuntime::GetDeviceContext( - const string& device_name, DeviceContext** device_context) const { + const std::string& device_name, DeviceContext** device_context) const { *device_context = nullptr; FunctionLibraryRuntime* flr = GetFLR(device_name); if (flr == nullptr) { return errors::InvalidArgument("Device name: ", device_name, " not found."); } Device* device = flr->device(); - string device_type = device->parsed_name().type; + std::string device_type = device->parsed_name().type; if (device_type == "CPU" || device_type == "TPU_SYSTEM") { // "TPU_SYSTEM" indicates that `device` is a CPU. return absl::OkStatus(); @@ -281,7 +283,7 @@ void ProcessFunctionLibraryRuntime::InitializeDeviceAndFlr() { } FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( - const string& device_name) const { + const std::string& device_name) const { Device* device = nullptr; if (device_name != kDefaultFLRDevice) { if (!device_mgr_->LookupDevice(device_name, &device).ok()) { @@ -299,14 +301,14 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( } FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle( - const string& function_key, const string& device_name, + const std::string& function_key, const std::string& device_name, FunctionLibraryRuntime::LocalHandle local_handle) { mutex_lock l(mu_); return AddHandleLocked(function_key, device_name, local_handle); } FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandleLocked( - const string& function_key, const string& device_name, + const std::string& function_key, const std::string& device_name, FunctionLibraryRuntime::LocalHandle local_handle) { auto h = next_handle_; function_data_[h] = @@ -318,7 +320,8 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandleLocked( FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddMultiDeviceHandle( - std::unique_ptr data, const string& function_key) { + std::unique_ptr data, + const std::string& function_key) { mutex_lock l(mu_); auto h = next_handle_; mdevice_data_[h] = std::move(data); @@ -338,14 +341,14 @@ bool ProcessFunctionLibraryRuntime::HasMultiDeviceHandle( } FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle( - const string& function_key) const { + const std::string& function_key) const { tf_shared_lock l(mu_); return gtl::FindWithDefault(table_, function_key, kInvalidHandle); } FunctionLibraryRuntime::LocalHandle ProcessFunctionLibraryRuntime::GetHandleOnDevice( - const string& device_name, FunctionLibraryRuntime::Handle handle, + const std::string& device_name, FunctionLibraryRuntime::Handle handle, bool include_multi_device) const { tf_shared_lock l(mu_); @@ -357,7 +360,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice( if (data.glue_.size() != 1) return kInvalidLocalHandle; const auto& pair = *data.glue_.begin(); - const string& func_device_name = pair.first; + const std::string& func_device_name = pair.first; const ComponentFunctionData& component_data = pair.second; if (func_device_name != device_name) return kInvalidLocalHandle; @@ -377,7 +380,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice( return function_data->local_handle(); } -string ProcessFunctionLibraryRuntime::GetDeviceName( +std::string ProcessFunctionLibraryRuntime::GetDeviceName( FunctionLibraryRuntime::Handle handle) const { tf_shared_lock l(mu_); auto iter = function_data_.find(handle); @@ -496,11 
+499,11 @@ void ProcessFunctionLibraryRuntime::PublishSubgraphs( } absl::Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle) { // Check if this function has already been instantiated. - const string& function_key = Canonicalize(function_name, attrs, options); + const std::string& function_key = Canonicalize(function_name, attrs, options); { mutex_lock l(mu_); @@ -517,12 +520,12 @@ absl::Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( if (VLOG_IS_ON(3)) { int index = 0; VLOG(3) << "Requested input devices:"; - for (const string& device : options.input_devices) { + for (const std::string& device : options.input_devices) { VLOG(3) << " [input " << index++ << "] " << device; } index = 0; VLOG(3) << "Requested output devices:"; - for (const string& device : options.output_devices) { + for (const std::string& device : options.output_devices) { VLOG(3) << " [output " << index++ << "] " << device; } } @@ -552,7 +555,7 @@ absl::Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( Device* cpu_device; TF_RETURN_IF_ERROR(device_mgr_->LookupDevice("CPU:0", &cpu_device)); - const uint64 optimization_start_time_usecs = Env::Default()->NowMicros(); + const uint64_t optimization_start_time_usecs = Env::Default()->NowMicros(); // Look up for optimized function graph in library. If found, skip // `OptimizeFunctionGraph` step. std::optional<absl::StatusOr<OptimizedFunctionGraph>> optimized_graph_proto = @@ -593,8 +596,8 @@ absl::Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( function_name, *optimized_graph_info, options, *dev_set, lib_def_, composite_devices, cpu_device, env_)); - const uint64 optimization_end_time_usecs = Env::Default()->NowMicros(); - const uint64 graph_optimization_duration = + const uint64_t optimization_end_time_usecs = Env::Default()->NowMicros(); + const uint64_t graph_optimization_duration = optimization_end_time_usecs - optimization_start_time_usecs; metrics::UpdateFunctionGraphOptimizationTime(graph_optimization_duration); VLOG(1) << "Finished graph optimizations for MultiDevice function \"" @@ -617,11 +620,11 @@ absl::Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( // We must preserve control returns in each of the function components, // otherwise after function inlining we might prune side-effectful nodes. const auto control_ret = - [&node_name_to_control_ret](const Node* n) -> std::optional<string> { + [&node_name_to_control_ret](const Node* n) -> std::optional<std::string> { const auto it = node_name_to_control_ret.find(n->name()); return it != node_name_to_control_ret.end() // NOLINTNEXTLINE - ? absl::make_optional<string>(it->second) + ?
absl::make_optional<std::string>(it->second) // NOLINTNEXTLINE : absl::nullopt; }; @@ -659,11 +662,11 @@ absl::Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( auto instantiate_component = [this, dev_set, &data_lib_def, &control_ret, &options, - &data](const string& target, + &data](const std::string& target, std::unique_ptr<Graph> subgraph, ComponentFunctionData* comp_data, std::function<void(absl::Status)> done) { - const string& device_type = + const std::string& device_type = dev_set->FindDeviceByName(target)->device_type(); bool ints_on_device = @@ -854,7 +857,7 @@ absl::Status ProcessFunctionLibraryRuntime::GetOutputDevices( continue; } - const string& target = pair.first; + const std::string& target = pair.first; FunctionLibraryRuntime* target_flr = GetFLR(target); Device* target_device = nullptr; Device* host = nullptr; @@ -863,7 +866,7 @@ absl::Status ProcessFunctionLibraryRuntime::GetOutputDevices( data->has_remote_outputs = true; } target_device = device_set()->FindDeviceByName(target); - string remote_host; + std::string remote_host; TF_RETURN_IF_ERROR( DeviceNameUtils::DeviceNameToCpuDeviceName(target, &remote_host)); host = device_set()->FindDeviceByName(remote_host); @@ -917,14 +920,14 @@ absl::Status ProcessFunctionLibraryRuntime::PrepareRunMultiDevice( return absl::OkStatus(); } -std::vector<string> ProcessFunctionLibraryRuntime::GetOrderedSubgraphs( +std::vector<std::string> ProcessFunctionLibraryRuntime::GetOrderedSubgraphs( const MultiDeviceFunctionData* data) const { - std::vector<string> subgraph_keys; + std::vector<std::string> subgraph_keys; subgraph_keys.reserve(data->glue_.size()); for (const auto& pair : data->glue_) { subgraph_keys.push_back(pair.first); } - auto send_first_ordering = [&](const string& a, const string& b) { + auto send_first_ordering = [&](const std::string& a, const std::string& b) { auto a_summary = data->glue_.at(a).async_attributes.summary(); auto b_summary = data->glue_.at(b).async_attributes.summary(); if (a_summary == b_summary) { @@ -969,9 +972,9 @@ absl::Status ProcessFunctionLibraryRuntime::RunMultiDeviceSync( // // We assume that the partitioning has a valid deadlock-free ordering and the // safety of running synchronously has already been confirmed by this point.
- std::vector<string> subgraph_keys = GetOrderedSubgraphs(data); + std::vector<std::string> subgraph_keys = GetOrderedSubgraphs(data); - for (const string& target : subgraph_keys) { + for (const std::string& target : subgraph_keys) { const ComponentFunctionData& comp_data = data->glue_.at(target); FunctionLibraryRuntime::Handle comp_handle = comp_data.handle; @@ -1003,9 +1006,9 @@ absl::Status ProcessFunctionLibraryRuntime::RunMultiDeviceSync( &comp_tensor_rets); if (!run_status.ok()) { VLOG(2) << "Component function execution failed: " << run_status; - const string function_and_msg = strings::StrCat( - errors::FormatFunctionForError(data->function_name_), " ", - run_status.message()); + const std::string function_and_msg = + absl::StrCat(errors::FormatFunctionForError(data->function_name_), + " ", run_status.message()); if (opts.rendezvous != nullptr) opts.rendezvous->StartAbort(run_status); return errors::CreateWithUpdatedMessage(run_status, function_and_msg); } else { @@ -1067,7 +1070,7 @@ void ProcessFunctionLibraryRuntime::RunMultiDeviceAsync( FunctionLibraryRuntime::Options opts_copy = opts; for (const auto& pair : data->glue_) { - const string& target = pair.first; + const std::string& target = pair.first; const ComponentFunctionData& comp_data = pair.second; FunctionLibraryRuntime::Handle comp_handle = pair.second.handle; @@ -1094,9 +1097,9 @@ void ProcessFunctionLibraryRuntime::RunMultiDeviceAsync( VLOG(2) << "Component function execution on target " << target << " from " << data->function_name_ << " with handle " << comp_handle << " failed: " << status; - const string function_and_msg = strings::StrCat( - errors::FormatFunctionForError(data->function_name_), " ", - status.message()); + const std::string function_and_msg = + absl::StrCat(errors::FormatFunctionForError(data->function_name_), + " ", status.message()); refcounted_done->UpdateStatus( errors::CreateWithUpdatedMessage(status, function_and_msg)); // Cancel the execution of other component functions. @@ -1147,7 +1150,7 @@ void ProcessFunctionLibraryRuntime::RunMultiDeviceAsync( } absl::Status ProcessFunctionLibraryRuntime::Instantiate( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle) { if (options.is_multi_device_function) { @@ -1195,7 +1198,7 @@ absl::Status ProcessFunctionLibraryRuntime::IsCrossProcess( } void ProcessFunctionLibraryRuntime::InstantiateRemote( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle, FunctionLibraryRuntime::DoneCallback done) { @@ -1207,7 +1210,7 @@ void ProcessFunctionLibraryRuntime::InstantiateRemote( } auto target = options.target; VLOG(1) << "ProcessFLR Instantiate: " << function_name << " on: " << target; - string function_key = Canonicalize(function_name, attrs, options); + std::string function_key = Canonicalize(function_name, attrs, options); FunctionData* f; { mutex_lock l(mu_); @@ -1257,7 +1260,7 @@ absl::Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle( // Release all component function handles.
absl::Status overall_status; for (const auto& it : mdata->glue_) { - const string& device = it.first; + const std::string& device = it.first; FunctionLibraryRuntime::Handle flr_handle = it.second.handle; FunctionLibraryRuntime* flr = GetFLR(device); if (flr == nullptr) { @@ -1291,7 +1294,7 @@ absl::Status ProcessFunctionLibraryRuntime::ReleaseHandle( } FunctionLibraryRuntime* flr = nullptr; - string target_device; + std::string target_device; { mutex_lock l(mu_); @@ -1455,7 +1458,7 @@ void ProcessFunctionLibraryRuntime::RunInternal( std::vector<std::unique_ptr<CleanUpItem>>* cleanup_items, FunctionLibraryRuntime::DoneCallback done) const { FunctionLibraryRuntime* flr = nullptr; - string target_device; + std::string target_device; FunctionLibraryRuntime::LocalHandle local_handle; { tf_shared_lock l(mu_); @@ -1480,7 +1483,7 @@ void ProcessFunctionLibraryRuntime::RunInternal( flr = GetFLR(target_device); if (flr != nullptr) { auto rendezvous = opts.rendezvous; - string source_device = opts.source_device; + std::string source_device = opts.source_device; DeviceContext* device_context; absl::Status s = GetDeviceContext(source_device, &device_context); if (!s.ok()) { diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index 0305bde12e6cba..d37f341ae83531 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -94,8 +94,8 @@ class ProcessFunctionLibraryRuntime { // `tensors_to_send` and indicates how the input tensors are allocated. Method // takes references on each of the `tensors_to_send`. Method doesn't block. static absl::Status SendTensors( - const string& source_device, const string& target_device, - const string& key_prefix, int64_t src_incarnation, + const std::string& source_device, const std::string& target_device, + const std::string& key_prefix, int64_t src_incarnation, absl::Span<const Tensor> tensors_to_send, DeviceContext* device_context, const std::vector<AllocatorAttributes>& alloc_attrs, RendezvousInterface* rendezvous); @@ -107,23 +107,23 @@ class ProcessFunctionLibraryRuntime { // tensors and should either be empty or `num_tensors` in size. Method doesn't // block and calls `done` when `num_tensors` are fetched. static void ReceiveTensorsAsync( - const string& source_device, const string& target_device, - const string& key_prefix, int64_t src_incarnation, int64_t num_tensors, - DeviceContext* device_context, + const std::string& source_device, const std::string& target_device, + const std::string& key_prefix, int64_t src_incarnation, + int64_t num_tensors, DeviceContext* device_context, const std::vector<AllocatorAttributes>& alloc_attrs, RendezvousInterface* rendezvous, std::vector<Tensor>* received_tensors, StatusCallback done); static const char kDefaultFLRDevice[]; // Returns the FunctionLibraryRuntime for the corresponding device_name. - FunctionLibraryRuntime* GetFLR(const string& device_name) const; + FunctionLibraryRuntime* GetFLR(const std::string& device_name) const; // Returns the return types for the function identified by handle `h`. absl::Status GetRetTypes(FunctionLibraryRuntime::Handle h, DataTypeVector* ret_types); // Returns the device incarnation for the given device_name.
- absl::Status GetDeviceIncarnation(const string& device_name, + absl::Status GetDeviceIncarnation(const std::string& device_name, int64_t* incarnation) const; // For a given canonicalized key signature of the function instantiated @@ -131,11 +131,12 @@ class ProcessFunctionLibraryRuntime { // that value. Uses core/common_runtime/framework/function.h::Canonicalize // to canonicalize the function signature. FunctionLibraryRuntime::Handle AddHandle( - const string& function_key, const string& device_name, + const std::string& function_key, const std::string& device_name, FunctionLibraryRuntime::LocalHandle local_handle); // Returns a handle if found for the given key, else returns kInvalidHandle. - FunctionLibraryRuntime::Handle GetHandle(const string& function_key) const; + FunctionLibraryRuntime::Handle GetHandle( + const std::string& function_key) const; // For the given handle instantiated on device `device_name` returns the local // index of instantiation of that function. If the function was not @@ -146,7 +147,7 @@ class ProcessFunctionLibraryRuntime { // with a single component that is placed on `device_name`, then this method // will return the local handle for that component. FunctionLibraryRuntime::LocalHandle GetHandleOnDevice( - const string& device_name, FunctionLibraryRuntime::Handle handle, + const std::string& device_name, FunctionLibraryRuntime::Handle handle, bool include_multi_device = false) const; // Fills `output_devices` with the devices on which the results will @@ -161,7 +162,7 @@ class ProcessFunctionLibraryRuntime { // Allows for function_name to be instantiated on different devices // as specified in attrs. absl::Status Instantiate( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle); @@ -273,7 +274,7 @@ class ProcessFunctionLibraryRuntime { // The handle for the instantiated component function. FunctionLibraryRuntime::Handle handle; // The name for the component function. - string name; + std::string name; // arg_indices.size() is the number of arguments to the component function. // The i-th argument of the component function comes from the // `arg_indices[i]`-th argument of the multi-device function. @@ -297,8 +298,8 @@ class ProcessFunctionLibraryRuntime { // The fields are filled in during instantiation. Once the object is // added to mdevice_data_, all fields are constant. struct MultiDeviceFunctionData { - MultiDeviceFunctionData(const string& function_name, - const string& function_key, int num_outputs, + MultiDeviceFunctionData(const std::string& function_name, + const std::string& function_key, int num_outputs, DataTypeVector ret_types) : function_name_(function_name), function_key_(function_key), @@ -308,9 +309,9 @@ class ProcessFunctionLibraryRuntime { is_cross_process_(false), has_remote_outputs(false) {} - const string function_name_; - const string function_key_; - uint64 instantiation_counter_; + const std::string function_name_; + const std::string function_key_; + uint64_t instantiation_counter_; // Stored here to resize the output tensor vector when function is run. const int num_outputs_; DataTypeVector ret_types_; @@ -325,12 +326,12 @@ class ProcessFunctionLibraryRuntime { // Maps the device name to the information about the component function // be run on this device. 
- std::unordered_map<string, ComponentFunctionData> glue_; + std::unordered_map<std::string, ComponentFunctionData> glue_; }; struct CleanUpItem { - string device; - uint64 step_id; + std::string device; + uint64_t step_id; FunctionLibraryRuntime::Handle local_handle; }; @@ -343,18 +344,18 @@ class ProcessFunctionLibraryRuntime { private: FunctionLibraryRuntime::Handle AddHandleLocked( - const string& function_key, const string& device_name, + const std::string& function_key, const std::string& device_name, FunctionLibraryRuntime::LocalHandle local_handle) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // For a given device_name, returns a DeviceContext for copying // tensors to/from the device. - absl::Status GetDeviceContext(const string& device_name, + absl::Status GetDeviceContext(const std::string& device_name, DeviceContext** device_context) const; // Looks up the information for the given `handle` and returns the name // of the device where the function is registered. - string GetDeviceName(FunctionLibraryRuntime::Handle handle) const; + std::string GetDeviceName(FunctionLibraryRuntime::Handle handle) const; // Removes handle from the state owned by this object. absl::Status RemoveHandle(FunctionLibraryRuntime::Handle handle); @@ -380,19 +381,19 @@ class ProcessFunctionLibraryRuntime { absl::Status ReleaseMultiDeviceHandle(FunctionLibraryRuntime::Handle handle); absl::Status InstantiateMultiDevice( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle); void InstantiateRemote( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::Handle* handle, FunctionLibraryRuntime::DoneCallback done); FunctionLibraryRuntime::Handle AddMultiDeviceHandle( const std::unique_ptr<MultiDeviceFunctionData> data, - const string& function_key); + const std::string& function_key); bool HasMultiDeviceHandle(FunctionLibraryRuntime::Handle handle) const; @@ -426,7 +427,7 @@ class ProcessFunctionLibraryRuntime { InternalArgs* comp_args); #endif // IS_MOBILE_PLATFORM - std::vector<string> GetOrderedSubgraphs( + std::vector<std::string> GetOrderedSubgraphs( const MultiDeviceFunctionData* data) const; absl::Status PrepareRunMultiDevice( @@ -458,15 +459,15 @@ // (to be executed on `target_device`) function. class FunctionData { public: - FunctionData(const string& target_device, + FunctionData(const std::string& target_device, FunctionLibraryRuntime::LocalHandle local_handle, - const string& function_key) + const std::string& function_key) : target_device_(target_device), local_handle_(local_handle), function_key_(function_key) {} - const string& target_device() { return target_device_; } - const string& function_key() { return function_key_; } + const std::string& target_device() { return target_device_; } + const std::string& function_key() { return function_key_; } FunctionLibraryRuntime::LocalHandle local_handle() { mutex_lock l(mu_); @@ -476,7 +477,8 @@ class ProcessFunctionLibraryRuntime { // Initializes the FunctionData object by potentially making an Initialize // call to the DistributedFunctionLibraryRuntime.
void DistributedInit( - DistributedFunctionLibraryRuntime* parent, const string& function_name, + DistributedFunctionLibraryRuntime* parent, + const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::DoneCallback done); @@ -489,9 +491,9 @@ class ProcessFunctionLibraryRuntime { private: mutex mu_; - const string target_device_; + const std::string target_device_; FunctionLibraryRuntime::LocalHandle local_handle_ TF_GUARDED_BY(mu_); - const string function_key_; + const std::string function_key_; bool is_cross_process_ TF_GUARDED_BY(mu_) = false; bool init_started_ TF_GUARDED_BY(mu_) = false; absl::Status init_result_ TF_GUARDED_BY(mu_); @@ -516,7 +518,7 @@ class ProcessFunctionLibraryRuntime { std::vector<CompositeDevice*> composite_devices_ TF_GUARDED_BY(mu_); // Holds all the function instantiations. Maps function_keys to handles. - std::unordered_map<string, FunctionLibraryRuntime::Handle> table_ + std::unordered_map<std::string, FunctionLibraryRuntime::Handle> table_ TF_GUARDED_BY(mu_); // Function data for instantiated remote functions. diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index df2f3db3f68ca7..5458203f8c592c 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -59,7 +59,7 @@ class TestClusterFLR : public DistributedFunctionLibraryRuntime { public: explicit TestClusterFLR(DeviceMgr* device_mgr) : device_mgr_(device_mgr) {} - void Instantiate(const string& function_name, + void Instantiate(const std::string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options, FunctionLibraryRuntime::LocalHandle* handle, @@ -82,7 +82,7 @@ class TestClusterFLR : public DistributedFunctionLibraryRuntime { absl::Span<const Tensor> args, std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) override {} - void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle, + void CleanUp(uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle, FunctionLibraryRuntime::DoneCallback done) override {} DeviceMgr* remote_device_mgr() const override { return device_mgr_; } @@ -169,7 +169,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status Instantiate( - const string& name, test::function::Attrs attrs, + const std::string& name, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts, FunctionLibraryRuntime::Handle* handle) { return proc_flr_->Instantiate(name, attrs, instantiate_opts, handle); @@ -214,7 +214,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { template <typename T, typename K> absl::Status RunWithRuntime( - const string& name, FunctionLibraryRuntime::Options opts, + const std::string& name, FunctionLibraryRuntime::Options opts, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts, const T& args, std::vector<K*> rets, @@ -270,7 +270,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { } absl::Status Run( - const string& name, FunctionLibraryRuntime::Options opts, + const std::string& name, FunctionLibraryRuntime::Options opts, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts, const std::vector<Tensor>& args, std::vector<Tensor*> rets, @@ -280,7 +280,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test
{ } absl::Status RunWithPackedArgs( - const string& name, FunctionLibraryRuntime::Options opts, + const std::string& name, FunctionLibraryRuntime::Options opts, test::function::Attrs attrs, const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts, const FunctionArgsInterface& args, std::vector<Tensor*> rets, @@ -503,7 +503,7 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceXTimes) { TEST_F(ProcessFunctionLibraryRuntimeTest, SameDeviceXTimesFourInt32MultiDevice) { Init({test::function::XTimesTwoInt32(), test::function::XTimesFourInt32()}); - auto x = test::AsTensor<int32>({1, 2, 3, 4}); + auto x = test::AsTensor<int32_t>({1, 2, 3, 4}); FunctionLibraryRuntime::Options opts; opts.source_device = "/job:a/replica:0/task:0/cpu:0"; opts.remote_execution = true; @@ -515,13 +515,13 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, Tensor y; TF_CHECK_OK(Run("XTimesFourInt32", opts, {{"T", DT_INT32}}, instantiate_opts, {x}, {&y})); - test::ExpectTensorEqual<int32>(y, test::AsTensor<int32>({4, 8, 12, 16})); + test::ExpectTensorEqual<int32_t>(y, test::AsTensor<int32_t>({4, 8, 12, 16})); } TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceXTimesMultiDevice) { Init({test::function::XTimesTwoInt32(), test::function::XTimesFourInt32()}); - auto x = test::AsTensor<int32>({1, 2, 3, 4}); + auto x = test::AsTensor<int32_t>({1, 2, 3, 4}); FunctionLibraryRuntime::Options opts; opts.source_device = "/job:a/replica:0/task:0/cpu:0"; opts.remote_execution = true; @@ -533,10 +533,10 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, Tensor y; TF_CHECK_OK(Run("XTimesTwoInt32", opts, {{"T", DT_INT32}}, instantiate_opts, {x}, {&y})); - test::ExpectTensorEqual<int32>(y, test::AsTensor<int32>({2, 4, 6, 8})); + test::ExpectTensorEqual<int32_t>(y, test::AsTensor<int32_t>({2, 4, 6, 8})); TF_CHECK_OK(Run("XTimesFourInt32", opts, {{"T", DT_INT32}}, instantiate_opts, {x}, {&y})); - test::ExpectTensorEqual<int32>(y, test::AsTensor<int32>({4, 8, 12, 16})); + test::ExpectTensorEqual<int32_t>(y, test::AsTensor<int32_t>({4, 8, 12, 16})); } TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceFindDevice) { @@ -668,7 +668,7 @@ bool IsCUDATensor(const Tensor& t) { void TestTwoDeviceMult( ProcessFunctionLibraryRuntimeTest* fixture, const FunctionLibraryRuntime::InstantiateOptions& inst_opts, - const string& error = "") { + const std::string& error = "") { fixture->Init({test::function::TwoDeviceMult()}); FunctionLibraryRuntime::Options opts; auto x = test::AsTensor<float>({1, 2, 3}); @@ -764,18 +764,18 @@ void TestTwoDeviceInputOutput( test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({30, 60})); } -std::vector<string> CompleteDevices(const std::vector<string>& v) { - std::vector<string> result; +std::vector<std::string> CompleteDevices(const std::vector<std::string>& v) { + std::vector<std::string> result; result.reserve(v.size()); - for (const string& s : v) { - result.push_back(strings::StrCat("/job:a/replica:0/task:0/device:", s)); + for (const std::string& s : v) { + result.push_back(absl::StrCat("/job:a/replica:0/task:0/device:", s)); } return result; } FunctionLibraryRuntime::InstantiateOptions MakeOptions( - const string& target, const std::vector<string>& input_devices, - const std::vector<string>& output_devices) { + const std::string& target, const std::vector<std::string>& input_devices, + const std::vector<std::string>& output_devices) { FunctionLibraryRuntime::InstantiateOptions inst_opts; inst_opts.target = target; inst_opts.input_devices = CompleteDevices(input_devices); @@ -924,8 +924,9 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_EmptyBodySwap) { test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2})); } -Tensor GetResourceHandle(const string& var_name, const string& container, - const string& device_name) { +Tensor
GetResourceHandle(const std::string& var_name, + const std::string& container, + const std::string& device_name) { ResourceHandle handle; handle.set_device(device_name); handle.set_container(container); @@ -1189,8 +1190,9 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_StateHandle) { // Attrs {}, // Nodes - {FunctionDefHelper::Const("shape", absl::Span<const int32>({1})), - FunctionDefHelper::Const("minval", 0), + {FunctionDefHelper::Const("shape", + absl::Span<const int32_t>({1})), + FunctionDefHelper::Const("minval", 0), {{"maxval"}, "ReadVariableOp", {"x"}, {{"dtype", T}}, {}}, // A stateful node. {{"y"}, diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc index a91eb74f1ef464..c79b42faffe83c 100644 --- a/tensorflow/core/common_runtime/process_state.cc +++ b/tensorflow/core/common_runtime/process_state.cc @@ -46,7 +46,7 @@ namespace tensorflow { ProcessState::ProcessState() : numa_enabled_(false), cpu_allocators_cached_(0) {} -string ProcessState::MemDesc::DebugString() { +std::string ProcessState::MemDesc::DebugString() { return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ", gpu_registered, ", nic: ", nic_registered); } diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h index dd667cc236a8e9..eb0b7f53a8c7a4 100644 --- a/tensorflow/core/common_runtime/process_state.h +++ b/tensorflow/core/common_runtime/process_state.h @@ -51,7 +51,7 @@ class ProcessState : public ProcessStateInterface { dev_index(0), gpu_registered(false), nic_registered(false) {} - string DebugString(); + std::string DebugString(); }; // If NUMA Allocators are desired, call this before calling any @@ -122,7 +122,7 @@ class RecordingAllocator : public Allocator { ProcessState::MemDesc md, mutex* mu) : mm_(mm), a_(a), md_(md), mu_(mu) {} - string Name() override { return a_->Name(); } + std::string Name() override { return a_->Name(); } void* AllocateRaw(size_t alignment, size_t num_bytes) override { void* p = a_->AllocateRaw(alignment, num_bytes); mutex_lock l(*mu_); diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 65733614bdc54c..233dcde498a6bc 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -35,12 +35,12 @@ namespace tensorflow { namespace { // Use environment setting if specified (init once) -int32 GetEnvNumInterOpThreads() { +int32_t GetEnvNumInterOpThreads() { static int32_t env_num_threads = NumInterOpThreadsFromEnvironment(); return env_num_threads; } -int32 DefaultNumInterOpThreads() { +int32_t DefaultNumInterOpThreads() { #ifndef __ANDROID__ int32_t env_num_threads = GetEnvNumInterOpThreads(); if (env_num_threads > 0) { @@ -90,13 +90,13 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) { return compute_pool; } -int32 NumInterOpThreadsFromEnvironment() { +int32_t NumInterOpThreadsFromEnvironment() { int32_t num; const char* val = std::getenv("TF_NUM_INTEROP_THREADS"); return (val && absl::SimpleAtoi(val, &num)) ? num : 0; } -int32 NumIntraOpThreadsFromEnvironment() { +int32_t NumIntraOpThreadsFromEnvironment() { int32_t num; const char* val = std::getenv("TF_NUM_INTRAOP_THREADS"); return (val && absl::SimpleAtoi(val, &num)) ?
num : 0; @@ -122,7 +122,7 @@ int32 DefaultNumIntraOpThreads() { return port::MaxParallelism(); } #endif // defined(ENABLE_ONEDNN_OPENMP) && defined(ENABLE_MKL) -int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { +int32_t NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { const int32_t inter_op = options.config.inter_op_parallelism_threads(); if (inter_op > 0) return inter_op; const int32_t env_inter_op = GetEnvNumInterOpThreads(); @@ -169,7 +169,7 @@ void SchedClosure(absl::AnyInvocable<void()> closure) { if (!tsl::tracing::EventCollector::IsEnabled()) { return Env::Default()->SchedClosure(std::move(closure)); } - uint64 id = tsl::tracing::GetUniqueArg(); + uint64_t id = tsl::tracing::GetUniqueArg(); tsl::tracing::RecordEvent(tsl::tracing::EventCategory::kScheduleClosure, id); Env::Default()->SchedClosure([id, closure = std::move(closure)]() mutable { diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h index cc2bc4390793c0..682556d19fbfad 100644 --- a/tensorflow/core/common_runtime/process_util.h +++ b/tensorflow/core/common_runtime/process_util.h @@ -32,10 +32,10 @@ namespace tensorflow { thread::ThreadPool* ComputePool(const SessionOptions& options); // Returns the TF_NUM_INTEROP_THREADS environment value, or 0 if not specified. -int32 NumInterOpThreadsFromEnvironment(); +int32_t NumInterOpThreadsFromEnvironment(); // Returns the TF_NUM_INTRAOP_THREADS environment value, or 0 if not specified. -int32 NumIntraOpThreadsFromEnvironment(); +int32_t NumIntraOpThreadsFromEnvironment(); // Returns the number of inter op threads specified in `options` or a default. // If no value or a negative value is specified in the provided options, then @@ -43,7 +43,7 @@ int32 NumIntraOpThreadsFromEnvironment(); // environment variable. If neither a value is specified in the options or in // the environment, this function will return a reasonable default value based // on the number of schedulable CPUs, and any MKL and OpenMP configurations. -int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options); +int32_t NumInterOpThreadsFromSessionOptions(const SessionOptions& options); // Creates a thread pool with number of inter op threads. // The number is set if `num_threads` > 0, otherwise it will be configured by diff --git a/tensorflow/core/common_runtime/profile_handler.h b/tensorflow/core/common_runtime/profile_handler.h index 71aac10bf6887a..28ae706c3f08b7 100644 --- a/tensorflow/core/common_runtime/profile_handler.h +++ b/tensorflow/core/common_runtime/profile_handler.h @@ -40,9 +40,9 @@ class ProfileHandler { // - label: Extra content for timeline click text. // - op_type: String name of the Op. // - details: Main content for timeline click text. - virtual void RecordOneOp(const string& device, const NodeExecStats& stats, - bool is_copy, absl::string_view label, - absl::string_view op_type, + virtual void RecordOneOp(const std::string& device, + const NodeExecStats& stats, bool is_copy, + absl::string_view label, absl::string_view op_type, absl::string_view details) = 0; // Records that the current step finished.
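(Aside: the process_util hunks above only rename int32 to int32_t; the lookup order for inter-op threads is unchanged: an explicit SessionOptions value wins, then TF_NUM_INTEROP_THREADS parsed with absl::SimpleAtoi, then a machine-derived default. A minimal standalone C++ sketch of that fallback chain follows; the helper name ResolveInterOpThreads is illustrative only, not TensorFlow API.)

#include <cstdint>
#include <cstdlib>
#include "absl/strings/numbers.h"

// Illustrative sketch, not TF source: mirrors the fallback chain of
// NumInterOpThreadsFromSessionOptions shown in the hunks above.
int32_t ResolveInterOpThreads(int32_t option_value, int32_t default_value) {
  if (option_value > 0) return option_value;  // Explicit option wins.
  int32_t env_value;
  const char* val = std::getenv("TF_NUM_INTEROP_THREADS");
  if (val != nullptr && absl::SimpleAtoi(val, &env_value) && env_value > 0) {
    return env_value;  // Environment override, parsed exactly as above.
  }
  return default_value;  // e.g. a value derived from port::MaxParallelism().
}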
diff --git a/tensorflow/core/common_runtime/propagator_state.cc b/tensorflow/core/common_runtime/propagator_state.cc index dee1903b112b2b..6d65024cc4f50a 100644 --- a/tensorflow/core/common_runtime/propagator_state.cc +++ b/tensorflow/core/common_runtime/propagator_state.cc @@ -159,7 +159,7 @@ void PropagatorState::PropagateOutputs(const TaggedNode& tagged_node, if (need_create_iter) { tsl::profiler::TraceMe activity( [&]() { - return strings::StrCat( + return absl::StrCat( "PropagateOutputs::NextIteration::CreateIterationState"); }, tsl::profiler::GetTFTraceMeLevel(/*is_expensive=*/false)); @@ -259,7 +259,7 @@ void PropagatorState::FindOrCreateChildFrame(FrameState* frame, const ImmutableExecutorState::FrameInfo& frame_info = immutable_state_.get_enter_frame_info(node_item); - const uint64 child_id = Hash64Combine( + const uint64_t child_id = Hash64Combine( frame->frame_id, Hash64Combine(iter_state->iter_num, Hash64(frame_info.name))); @@ -275,7 +275,7 @@ void PropagatorState::FindOrCreateChildFrame(FrameState* frame, // Need to create a new frame instance. // Note that this new frame instance is created without any locks. if (vlog_) { - const string child_name = strings::StrCat( + const std::string child_name = strings::StrCat( frame->frame_name, ";", iter_state->iter_num, ";", frame_info.name); VLOG(2) << "Create frame: " << child_name << " id: " << child_id; } diff --git a/tensorflow/core/common_runtime/propagator_state.h b/tensorflow/core/common_runtime/propagator_state.h index bdfea225a5ac2d..238cb0552b2c67 100644 --- a/tensorflow/core/common_runtime/propagator_state.h +++ b/tensorflow/core/common_runtime/propagator_state.h @@ -255,11 +255,11 @@ class PropagatorState { // The name of this frame, which is the concatenation of its parent // frame name, the iteration of the parent frame when this frame was // created, and the value of the attr 'frame_name'. - string frame_name; + std::string frame_name; // The unique id for this frame. Generated by fingerprinting // frame_name. - uint64 frame_id; + uint64_t frame_id; // The iteration state of its parent frame when this frame is created. // nullptr if there is no parent frame. The frame_name/parent_iter pair @@ -543,7 +543,7 @@ class PropagatorState { // child frame is a hash composed of the ID of the parent frame, the iteration // number at which the parent frame is creating the new frame, and the // name of the new frame from nodedef.
- absl::flat_hash_map<uint64, FrameState*> outstanding_frames_ + absl::flat_hash_map<uint64_t, FrameState*> outstanding_frames_ TF_GUARDED_BY(mu_); PropagatorState(const PropagatorState&) = delete; @@ -579,12 +579,12 @@ class OrderedPropagatorState : public PropagatorState { private: static bool compare(TaggedNode const& lhs, TaggedNode const& rhs) { - std::tuple<int, uint64, int64> lhs_prio{lhs.node_item->node_id, - lhs.input_frame->frame_id, - lhs.input_iter->iter_num}; - std::tuple<int, uint64, int64> rhs_prio{rhs.node_item->node_id, - rhs.input_frame->frame_id, - rhs.input_iter->iter_num}; + std::tuple<int, uint64_t, int64_t> lhs_prio{lhs.node_item->node_id, + lhs.input_frame->frame_id, + lhs.input_iter->iter_num}; + std::tuple<int, uint64_t, int64_t> rhs_prio{rhs.node_item->node_id, + rhs.input_frame->frame_id, + rhs.input_iter->iter_num}; return lhs_prio < rhs_prio; } diff --git a/tensorflow/core/common_runtime/quantize_training.cc b/tensorflow/core/common_runtime/quantize_training.cc index c800552b5d3bca..3459153ed7dace 100644 --- a/tensorflow/core/common_runtime/quantize_training.cc +++ b/tensorflow/core/common_runtime/quantize_training.cc @@ -35,18 +35,18 @@ namespace tensorflow { namespace { // TODO(suharshs): If desired, make these values configurable. -const uint32 kAllowedInputs = 2; +const uint32_t kAllowedInputs = 2; const float kEMADecay = 0.999; // Node types to rewrite. Insert quantize_and_dequantize op for their inputs. const auto* nodes_to_rewrite = - new std::unordered_set<string>{"MatMul", "Conv2D"}; + new std::unordered_set<std::string>{"MatMul", "Conv2D"}; // Contains necessary parameters to convert an edge. struct EdgeToConvert { // edge is not owned here. const Edge* edge; - int32 num_bits; + int32_t num_bits; bool signed_input; bool range_given; float input_min; @@ -67,7 +67,7 @@ struct EdgeToConvert { // TODO(jmchen): Make this check more robust as it is not guaranteed that the // forward node will not be named with a leading "gradients". inline bool IsGradientNode(const Graph* graph, const Node* node) { - static const string tag = "gradients"; + static const std::string tag = "gradients"; return (node->name().compare(0, tag.size(), tag) == 0); } @@ -76,7 +76,7 @@ inline bool IsGradientNode(const Graph* graph, const Node* node) { // Returns true if the root tensor op type is known, false otherwise. bool FindType(const Graph* graph, const Node* node, bool* signed_input, bool* range_given, float* input_min, float* input_max) { - const string& src_op = node->type_string(); + const std::string& src_op = node->type_string(); if (src_op == "Const" || src_op == "Variable" || src_op == "VariableV2") { *signed_input = true; *range_given = false; @@ -154,7 +154,7 @@ absl::Status FindSaveOp(const Graph* graph, Node** save_op, Node* FindRestoreAllOp(const Graph* graph, absl::string_view save_prefix) { for (Node* node : graph->op_nodes()) { // The restore_all op should have the same prefix of the save_op.
- if (node->name() == strings::StrCat(save_prefix, "/restore_all")) { + if (node->name() == absl::StrCat(save_prefix, "/restore_all")) { return node; } } @@ -254,21 +254,21 @@ absl::Status AddRestoreVariableSubgraphs( if (restore_all == nullptr) { return errors::InvalidArgument("graph has SaveOp, but no restore_all NoOp"); } - const string restore_op_name = strings::StrCat(name_prefix, "/RestoreV2"); - const string assign_op_name = strings::StrCat(name_prefix, "/Assign"); + const std::string restore_op_name = absl::StrCat(name_prefix, "/RestoreV2"); + const std::string assign_op_name = absl::StrCat(name_prefix, "/Assign"); for (Node* var : variables) { // Add an extra prefix after calling graph->NewName because the "unique" // name may conflict with names generated for Send nodes. // TODO(b/77547936): fix this more generally and get rid of the extra prefix // here. - string new_restore_op_name = - strings::StrCat(graph->NewName(restore_op_name), "_qt"); - string new_assign_op_name = - strings::StrCat(graph->NewName(assign_op_name), "_qt"); - string tensor_names_op_name = - strings::StrCat(new_restore_op_name, "/tensor_names"); - string shape_and_slices_op_name = - strings::StrCat(new_restore_op_name, "/shape_and_slices"); + std::string new_restore_op_name = + absl::StrCat(graph->NewName(restore_op_name), "_qt"); + std::string new_assign_op_name = + absl::StrCat(graph->NewName(assign_op_name), "_qt"); + std::string tensor_names_op_name = + absl::StrCat(new_restore_op_name, "/tensor_names"); + std::string shape_and_slices_op_name = + absl::StrCat(new_restore_op_name, "/shape_and_slices"); // Construct the tensor_names input with the variable name. Node* tensor_names; @@ -329,32 +329,32 @@ absl::Status AddSaveAndRestore(Graph* graph, // Sets output to the Node that computes reduction axes corresponding to all // dimensions of input and return. -absl::Status MakeReductionAxes(Graph* graph, string name_prefix, Node* input, - Node** output) { - name_prefix = strings::StrCat(name_prefix, "/ReductionAxes"); +absl::Status MakeReductionAxes(Graph* graph, std::string name_prefix, + Node* input, Node** output) { + name_prefix = absl::StrCat(name_prefix, "/ReductionAxes"); Node* start; Tensor zero_tensor(DT_INT32, TensorShape()); - zero_tensor.flat<int32>()(0) = 0; + zero_tensor.flat<int32_t>()(0) = 0; TF_RETURN_IF_ERROR( - NodeBuilder(strings::StrCat(name_prefix, "/RangeStart"), "Const") + NodeBuilder(absl::StrCat(name_prefix, "/RangeStart"), "Const") .Attr("dtype", DT_INT32) .Attr("value", zero_tensor) .Finalize(graph, &start)); Node* delta; Tensor one_tensor(DT_INT32, TensorShape()); - one_tensor.flat<int32>()(0) = 1; + one_tensor.flat<int32_t>()(0) = 1; TF_RETURN_IF_ERROR( - NodeBuilder(strings::StrCat(name_prefix, "/RangeDelta"), "Const") + NodeBuilder(absl::StrCat(name_prefix, "/RangeDelta"), "Const") .Attr("dtype", DT_INT32) .Attr("value", one_tensor) .Finalize(graph, &delta)); Node* rank; TF_RETURN_IF_ERROR( - NodeBuilder(strings::StrCat(name_prefix, "/InputRank"), "Rank") + NodeBuilder(absl::StrCat(name_prefix, "/InputRank"), "Rank") .Input(input) .Finalize(graph, &rank)); TF_RETURN_IF_ERROR( - NodeBuilder(strings::StrCat(name_prefix, "/ReductionAxes"), "Range") + NodeBuilder(absl::StrCat(name_prefix, "/ReductionAxes"), "Range") .Input(start) .Input(rank) .Input(delta) @@ -363,45 +363,43 @@ absl::Status MakeReductionAxes(Graph* graph, string name_prefix, Node* input, } // Computes the exponential moving average of input, updated in update_variable.
-absl::Status MakeExponentialMovingAverage(Graph* graph, string name_prefix,
+absl::Status MakeExponentialMovingAverage(Graph* graph, std::string name_prefix,
                                           const NodeBuilder::NodeOut& input,
                                           Node* decay, Node* update_variable,
                                           Node** assign_value) {
   // variable_t+1 = variable_t - [(variable_t - value) * (1 - decay)]
-  name_prefix = strings::StrCat(name_prefix, "/EMA");
+  name_prefix = absl::StrCat(name_prefix, "/EMA");
   Node* one;
   Tensor one_tensor(DT_FLOAT, TensorShape());
   one_tensor.flat<float>()(0) = 1.0;
   TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name_prefix, "/OneConst"), "Const")
+      NodeBuilder(absl::StrCat(name_prefix, "/OneConst"), "Const")
          .Attr("dtype", DT_FLOAT)
          .Attr("value", one_tensor)
          .Finalize(graph, &one));
   Node* decay_complement;
   TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name_prefix, "/DecayComplement"), "Sub")
+      NodeBuilder(absl::StrCat(name_prefix, "/DecayComplement"), "Sub")
          .Input(one)
          .Input(decay)
          .Finalize(graph, &decay_complement));
 
   Node* value_diff;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name_prefix, "/ValueDiff"), "Sub")
-          .Input(update_variable)
-          .Input(input)
-          .Finalize(graph, &value_diff));
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name_prefix, "/ValueDiff"), "Sub")
+                         .Input(update_variable)
+                         .Input(input)
+                         .Finalize(graph, &value_diff));
   Node* update_value;
   TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name_prefix, "/UpdateValue"), "Mul")
+      NodeBuilder(absl::StrCat(name_prefix, "/UpdateValue"), "Mul")
          .Input(value_diff)
          .Input(decay_complement)
          .Finalize(graph, &update_value));
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name_prefix, "/EMAValue"), "Sub")
-          .Input(update_variable)
-          .Input(update_value)
-          .Finalize(graph, assign_value));
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name_prefix, "/EMAValue"), "Sub")
+                         .Input(update_variable)
+                         .Input(update_value)
+                         .Finalize(graph, assign_value));
   return absl::OkStatus();
 }
 
@@ -416,25 +414,24 @@ absl::Status MakeExponentialMovingAverage(Graph* graph, string name_prefix,
 // |      EMA    init_val
 // |       \    /
 // +----------- assign
-absl::Status MakeInitializedEMAVariable(Graph* graph, const string& name,
+absl::Status MakeInitializedEMAVariable(Graph* graph, const std::string& name,
                                         Node* decay, Node* init_val,
                                         std::vector<Node*>* added_variables,
                                         Node** var) {
   // TODO(suharshs): Update this to use ResourceVariables when they are ready.
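Aside: per element, the node chain assembled in MakeExponentialMovingAverage above (OneConst, DecayComplement, ValueDiff, UpdateValue, EMAValue) reduces to one scalar update. A minimal sketch in plain C++, not part of the patch; EmaStep is a hypothetical name:

// ema' = ema - (ema - x) * (1 - decay), which is algebraically
// decay * ema + (1 - decay) * x.
float EmaStep(float ema, float x, float decay) {
  return ema - (ema - x) * (1.0f - decay);
}

With kEMADecay = 0.999, each run nudges the tracked min/max value 0.1% of the way toward the newly observed extremum.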
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name, "/Variable"), "VariableV2")
-          .Attr("shape", TensorShape())
-          .Attr("dtype", DT_FLOAT)
-          .Finalize(graph, var));
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name, "/Variable"), "VariableV2")
+                         .Attr("shape", TensorShape())
+                         .Attr("dtype", DT_FLOAT)
+                         .Finalize(graph, var));
   added_variables->push_back(*var);
 
   Node* is_initialized;
-  TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat(name, "/IsInitialized"),
-                                 "IsVariableInitialized")
-                         .Input(*var)
-                         .Finalize(graph, &is_initialized));
+  TF_RETURN_IF_ERROR(
+      NodeBuilder(absl::StrCat(name, "/IsInitialized"), "IsVariableInitialized")
+          .Input(*var)
+          .Finalize(graph, &is_initialized));
   Node* switch_node;
-  TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat(name, "/Switch"), "Switch")
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name, "/Switch"), "Switch")
                          .Input(init_val)
                          .Input(is_initialized)
                          .Finalize(graph, &switch_node));
@@ -446,20 +443,19 @@ absl::Status MakeInitializedEMAVariable(Graph* graph, const string& name,
                                             decay, *var, &ema_value));
 
   Node* assign_value;
-  TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat(name, "/Merge"), "Merge")
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name, "/Merge"), "Merge")
                          .Input({output_false, ema_value})
                          .Finalize(graph, &assign_value));
 
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name, "/AssignValue"), "Assign")
-          .Input(*var)
-          .Input(assign_value)
-          .Finalize(graph, var));
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name, "/AssignValue"), "Assign")
+                         .Input(*var)
+                         .Input(assign_value)
+                         .Finalize(graph, var));
   return absl::OkStatus();
 }
 
 // Computes the min and max EMA of input and stores them in min_var and max_var.
-absl::Status MakeEMAMinMaxVars(Graph* graph, const string& name_prefix,
+absl::Status MakeEMAMinMaxVars(Graph* graph, const std::string& name_prefix,
                                Node* input, std::vector<Node*>* added_variables,
                                Node** min_var, Node** max_var) {
   // TODO(suharshs): The decay will be constant, so we could make only one for
@@ -468,23 +464,22 @@
   Tensor decay_tensor(DT_FLOAT, TensorShape());
   decay_tensor.flat<float>()(0) = kEMADecay;
   Node* decay;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(strings::StrCat(name_prefix, "/Decay"), "Const")
-          .Attr("dtype", DT_FLOAT)
-          .Attr("value", decay_tensor)
-          .Finalize(graph, &decay));
+  TF_RETURN_IF_ERROR(NodeBuilder(absl::StrCat(name_prefix, "/Decay"), "Const")
+                         .Attr("dtype", DT_FLOAT)
+                         .Attr("value", decay_tensor)
+                         .Finalize(graph, &decay));
 
   Node* reduction_axes;
   TF_RETURN_IF_ERROR(
       MakeReductionAxes(graph, name_prefix, input, &reduction_axes));
   Node* min;
-  string min_name = strings::StrCat(name_prefix, "/Min");
+  std::string min_name = absl::StrCat(name_prefix, "/Min");
   TF_RETURN_IF_ERROR(NodeBuilder(min_name, "Min")
                          .Input(input)
                          .Input(reduction_axes)
                          .Finalize(graph, &min));
   Node* max;
-  string max_name = strings::StrCat(name_prefix, "/Max");
+  std::string max_name = absl::StrCat(name_prefix, "/Max");
   TF_RETURN_IF_ERROR(NodeBuilder(max_name, "Max")
                          .Input(input)
                          .Input(reduction_axes)
                          .Finalize(graph, &max));
@@ -498,7 +493,7 @@ absl::Status MakeEMAMinMaxVars(Graph* graph, const string& name_prefix,
 
 // Makes an input min and max constant if the range is given. Otherwise, makes
 // min and max variables that are updated by an EMA.
-absl::Status MakeInputMinMax(Graph* graph, const string& name_prefix,
+absl::Status MakeInputMinMax(Graph* graph, const std::string& name_prefix,
                              const EdgeToConvert& edge,
                              std::vector<Node*>* added_variables,
                              Node** input_min, Node** input_max) {
@@ -508,14 +503,14 @@
     Tensor input_min_tensor(DT_FLOAT, TensorShape());
     input_min_tensor.flat<float>()(0) = edge.input_min;
     TF_RETURN_IF_ERROR(
-        NodeBuilder(strings::StrCat(name_prefix, "/InputMin"), "Const")
+        NodeBuilder(absl::StrCat(name_prefix, "/InputMin"), "Const")
            .Attr("dtype", DT_FLOAT)
            .Attr("value", input_min_tensor)
            .Finalize(graph, input_min));
     Tensor input_max_tensor(DT_FLOAT, TensorShape());
     input_max_tensor.flat<float>()(0) = edge.input_max;
     TF_RETURN_IF_ERROR(
-        NodeBuilder(strings::StrCat(name_prefix, "/InputMax"), "Const")
+        NodeBuilder(absl::StrCat(name_prefix, "/InputMax"), "Const")
            .Attr("dtype", DT_FLOAT)
            .Attr("value", input_max_tensor)
            .Finalize(graph, input_max));
@@ -532,8 +527,8 @@
 // Adds a QuantizeAndDequantizeV2 or FakeQuantizeWithMinMaxVars op
 // (and required input nodes) based on edge.
 // The result is stored in convert_node.
-absl::Status MakeQuantizeOp(Graph* graph, const string& name_prefix,
-                            const string& quant_op_type,
+absl::Status MakeQuantizeOp(Graph* graph, const std::string& name_prefix,
+                            const std::string& quant_op_type,
                             const EdgeToConvert& edge,
                             std::vector<Node*>* added_variables,
                             Node** convert_node) {
@@ -541,7 +536,7 @@ absl::Status MakeQuantizeOp(Graph* graph, const string& name_prefix,
   Node* input_max;
   TF_RETURN_IF_ERROR(MakeInputMinMax(graph, name_prefix, edge, added_variables,
                                      &input_min, &input_max));
-  string quant_name = strings::StrCat(name_prefix, "/", quant_op_type);
+  std::string quant_name = absl::StrCat(name_prefix, "/", quant_op_type);
   if (quant_op_type == "QuantizeAndDequantizeV2") {
     TF_RETURN_IF_ERROR(NodeBuilder(quant_name, quant_op_type)
                            .Input(edge.edge->src())
@@ -566,15 +561,15 @@ absl::Status MakeQuantizeOp(Graph* graph, const string& name_prefix,
 
 // Insert conversion op, connect it to the graph and remove the old edge.
 absl::Status ProcessTargetEdges(
-    Graph* graph, const string& quant_op_type,
+    Graph* graph, const std::string& quant_op_type,
     const std::vector<EdgeToConvert>& target_edges) {
   // Remember previously converted ops to avoid duplicated conversion on the
   // same input.
-  std::unordered_map<string, Node*> name_index;
+  std::unordered_map<std::string, Node*> name_index;
   std::vector<Node*> added_variables;
   for (const EdgeToConvert edge : target_edges) {
     Node* convert_node;
-    string name_prefix = edge.edge->src()->name();
+    std::string name_prefix = edge.edge->src()->name();
 
     auto iter = name_index.find(name_prefix);
     if (iter == name_index.end()) {
@@ -596,7 +591,8 @@ absl::Status ProcessTargetEdges(
 
 }  // namespace
 
-absl::Status DoQuantizeTraining(int32_t num_bits, const string& quant_op_type,
+absl::Status DoQuantizeTraining(int32_t num_bits,
+                                const std::string& quant_op_type,
                                 Graph* graph) {
   if (graph == nullptr) {
     return errors::InvalidArgument("Cannot accept empty graph pointer.");
@@ -663,7 +659,7 @@ absl::Status DoQuantizeTraining(int32_t num_bits, const string& quant_op_type,
 
 absl::Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef,
                                           int32_t num_bits,
-                                          const string& quant_op_type,
+                                          const std::string& quant_op_type,
                                           GraphDef* result_graphdef) {
   Graph graph(OpRegistry::Global());
   GraphConstructorOptions opts;
@@ -678,8 +674,8 @@ absl::Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef,
 }
 
 absl::Status DoQuantizeTrainingOnSerializedGraphDef(
-    const string& input_graph_string, int32_t num_bits,
-    const string& quant_op_type, string* result_graph_string) {
+    const std::string& input_graph_string, int32_t num_bits,
+    const std::string& quant_op_type, std::string* result_graph_string) {
   // First create the graph from the GraphDef.
   GraphDef input_graphdef;
   if (!ParseProtoUnlimited(&input_graphdef, input_graph_string)) {
diff --git a/tensorflow/core/common_runtime/quantize_training.h b/tensorflow/core/common_runtime/quantize_training.h
index de3ed6b476b24a..21f794cbec8f2c 100644
--- a/tensorflow/core/common_runtime/quantize_training.h
+++ b/tensorflow/core/common_runtime/quantize_training.h
@@ -35,21 +35,20 @@ namespace tensorflow {
 //  - num_bits out of range.
 //  - g is null.
 //  - More than 1 unknown ops encountered.
-absl::Status DoQuantizeTraining(int32_t num_bits, const string& quant_op_type,
-                                Graph* g);
+absl::Status DoQuantizeTraining(int32_t num_bits,
+                                const std::string& quant_op_type, Graph* g);
 
 // Converts the input serialized GraphDef and returns a rewritten serialized
 // GraphDef for quantized training.
-absl::Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
-                                                    int32_t num_bits,
-                                                    const string& quant_op_type,
-                                                    string* result_graph);
+absl::Status DoQuantizeTrainingOnSerializedGraphDef(
+    const std::string& input_graph, int32_t num_bits,
+    const std::string& quant_op_type, std::string* result_graph);
 
 // Converts the input GraphDef and returns a rewritten GraphDef for quantized
 // training.
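For orientation, a hedged sketch of how the serialized entry point is typically driven, mirroring the QuantizeSerializedGraphDef test later in this patch (`graph` is an assumed pre-built Graph*; error handling condensed; the GraphDef overload declared next follows the same pattern):

// Serialize a graph, rewrite it for quantized training, parse the result.
GraphDef input_graphdef;
graph->ToGraphDef(&input_graphdef);
std::string input_string;
input_graphdef.SerializeToString(&input_string);

std::string result_string;
TF_RETURN_IF_ERROR(DoQuantizeTrainingOnSerializedGraphDef(
    input_string, /*num_bits=*/8, "QuantizeAndDequantizeV2", &result_string));

GraphDef result_graphdef;
if (!ParseProtoUnlimited(&result_graphdef, result_string)) {
  return errors::InvalidArgument("Could not parse rewritten GraphDef.");
}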
 absl::Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef,
                                           int32_t num_bits,
-                                          const string& quant_op_type,
+                                          const std::string& quant_op_type,
                                           GraphDef* result_graphdef);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/quantize_training_test.cc b/tensorflow/core/common_runtime/quantize_training_test.cc
index 7f2e1b0e709d35..5d4a1ac2618de2 100644
--- a/tensorflow/core/common_runtime/quantize_training_test.cc
+++ b/tensorflow/core/common_runtime/quantize_training_test.cc
@@ -51,7 +51,7 @@ class QuantizeTrainingTest : public ::testing::Test {
     return test::graph::Constant(g_.get(), test::AsTensor(values, shape));
   }
 
-  absl::Status Placeholder(Graph* g, const string& name, TensorShape shape,
+  absl::Status Placeholder(Graph* g, const std::string& name, TensorShape shape,
                            Node** out) {
     TF_RETURN_IF_ERROR(NodeBuilder(name, "Placeholder")
                            .Attr("dtype", DT_FLOAT)
@@ -60,7 +60,7 @@ class QuantizeTrainingTest : public ::testing::Test {
     return absl::OkStatus();
   }
 
-  absl::Status FindNode(Graph* g, const string& name, Node** out) {
+  absl::Status FindNode(Graph* g, const std::string& name, Node** out) {
     for (Node* node : g->nodes()) {
       if (node->name() == name) {
         *out = node;
@@ -111,15 +111,14 @@ TEST_F(QuantizeTrainingTest, SignedInput) {
   // Quantize_and_dequantize node for identity should have signed_input==true.
   Node* identity_q_node;
   TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(identity->name(), "/QuantizeAndDequantizeV2"),
+      FindNode(g, absl::StrCat(identity->name(), "/QuantizeAndDequantizeV2"),
                &identity_q_node));
   ASSERT_EQ("true",
             SummarizeAttrValue(*identity_q_node->attrs().Find("signed_input")));
   // Quantize_and_dequantize node for relu should have signed_input==false.
   Node* relu_q_node;
-  TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
-               &relu_q_node));
+  TF_ASSERT_OK(FindNode(
+      g, absl::StrCat(relu->name(), "/QuantizeAndDequantizeV2"), &relu_q_node));
   ASSERT_EQ("false",
             SummarizeAttrValue(*relu_q_node->attrs().Find("signed_input")));
 }
@@ -161,16 +160,15 @@ TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
 
   // Quantize_and_dequantize node for relu6 should have range_given==true.
   Node* relu6_q_node;
-  TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(relu6->name(), "/QuantizeAndDequantizeV2"),
-               &relu6_q_node));
+  TF_ASSERT_OK(FindNode(g,
+                        absl::StrCat(relu6->name(), "/QuantizeAndDequantizeV2"),
+                        &relu6_q_node));
   ASSERT_EQ("true",
             SummarizeAttrValue(*relu6_q_node->attrs().Find("range_given")));
   // Quantize_and_dequantize node for relu should have range_given==true.
   Node* relu_q_node;
-  TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
-               &relu_q_node));
+  TF_ASSERT_OK(FindNode(
+      g, absl::StrCat(relu->name(), "/QuantizeAndDequantizeV2"), &relu_q_node));
   ASSERT_EQ("true",
             SummarizeAttrValue(*relu_q_node->attrs().Find("range_given")));
 }
@@ -215,18 +213,17 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   // Ensure that the backwards matmul input was not quantized.
   Node* found_node;
   absl::Status s = FindNode(
-      g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"), &found_node);
+      g, absl::StrCat(d->name(), "/QuantizeAndDequantizeV2"), &found_node);
   EXPECT_TRUE(absl::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
+  TF_ASSERT_OK(FindNode(
+      g, absl::StrCat(relu->name(), "/QuantizeAndDequantizeV2"), &found_node));
   TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
-               &found_node));
-  TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(identity->name(), "/QuantizeAndDequantizeV2"),
+      FindNode(g, absl::StrCat(identity->name(), "/QuantizeAndDequantizeV2"),
                &found_node));
-  TF_ASSERT_OK(FindNode(
-      g, strings::StrCat(c->name(), "/QuantizeAndDequantizeV2"), &found_node));
+  TF_ASSERT_OK(FindNode(g, absl::StrCat(c->name(), "/QuantizeAndDequantizeV2"),
+                        &found_node));
 }
 
 TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
@@ -269,18 +266,17 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
   // Ensure that the backwards matmul input was not quantized.
   Node* found_node;
   absl::Status s = FindNode(
-      g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"), &found_node);
+      g, absl::StrCat(d->name(), "/FakeQuantWithMinMaxVars"), &found_node);
   EXPECT_TRUE(absl::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
+  TF_ASSERT_OK(FindNode(
+      g, absl::StrCat(relu->name(), "/FakeQuantWithMinMaxVars"), &found_node));
   TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(relu->name(), "/FakeQuantWithMinMaxVars"),
-               &found_node));
-  TF_ASSERT_OK(
-      FindNode(g, strings::StrCat(identity->name(), "/FakeQuantWithMinMaxVars"),
+      FindNode(g, absl::StrCat(identity->name(), "/FakeQuantWithMinMaxVars"),
                &found_node));
-  TF_ASSERT_OK(FindNode(
-      g, strings::StrCat(c->name(), "/FakeQuantWithMinMaxVars"), &found_node));
+  TF_ASSERT_OK(FindNode(g, absl::StrCat(c->name(), "/FakeQuantWithMinMaxVars"),
+                        &found_node));
 }
 
 TEST_F(QuantizeTrainingTest, QuantizeSerializedGraphDef) {
@@ -301,10 +297,10 @@ TEST_F(QuantizeTrainingTest, QuantizeSerializedGraphDef) {
   // Convert the graph to the graphdef string.
   GraphDef input_graph;
   graph->ToGraphDef(&input_graph);
-  string input_string;
+  std::string input_string;
   input_graph.SerializeToString(&input_string);
 
-  string result_string;
+  std::string result_string;
   TF_ASSERT_OK(DoQuantizeTrainingOnSerializedGraphDef(
       input_string, num_bits, "QuantizeAndDequantizeV2", &result_string));
 
@@ -400,8 +396,8 @@ TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_QuantizeAndDequantize) {
 
   // The min and max values of the relu6 quantization should be constant values
   // of 0 and 6.
-  string min_const_name = strings::StrCat(relu6->name(), "/InputMin");
-  string max_const_name = strings::StrCat(relu6->name(), "/InputMax");
+  std::string min_const_name = absl::StrCat(relu6->name(), "/InputMin");
+  std::string max_const_name = absl::StrCat(relu6->name(), "/InputMax");
   std::vector<Tensor> outputs;
   TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
   EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
@@ -416,8 +412,8 @@ TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_QuantizeAndDequantize) {
 
   // The value of the min and max should be set to the min and max of a1 since
   // this is the first run that initializes the EMA variables.
-  string min_var_name = strings::StrCat(relu->name(), "/Min/Variable");
-  string max_var_name = strings::StrCat(relu->name(), "/Max/Variable");
+  std::string min_var_name = absl::StrCat(relu->name(), "/Min/Variable");
+  std::string max_var_name = absl::StrCat(relu->name(), "/Max/Variable");
   TF_ASSERT_OK(sess->Run({}, {min_var_name, max_var_name}, {}, &outputs));
   EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
   EXPECT_EQ(outputs[1].flat<float>()(0), 3.0);
@@ -494,8 +490,8 @@ TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_FakeQuant) {
 
   // The min and max values of the relu6 quantization should be constant values
   // of 0 and 6.
-  string min_const_name = strings::StrCat(relu6->name(), "/InputMin");
-  string max_const_name = strings::StrCat(relu6->name(), "/InputMax");
+  std::string min_const_name = absl::StrCat(relu6->name(), "/InputMin");
+  std::string max_const_name = absl::StrCat(relu6->name(), "/InputMax");
   std::vector<Tensor> outputs;
   TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
   EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
@@ -510,8 +506,8 @@ TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_FakeQuant) {
 
   // The value of the min and max should be set to the min and max of a1 since
   // this is the first run that initializes the EMA variables.
-  string min_var_name = strings::StrCat(relu->name(), "/Min/Variable");
-  string max_var_name = strings::StrCat(relu->name(), "/Max/Variable");
+  std::string min_var_name = absl::StrCat(relu->name(), "/Min/Variable");
+  std::string max_var_name = absl::StrCat(relu->name(), "/Max/Variable");
   TF_ASSERT_OK(sess->Run({}, {min_var_name, max_var_name}, {}, &outputs));
   EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
   EXPECT_EQ(outputs[1].flat<float>()(0), 3.0);
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
index 0bfc121b23cab4..a4c15f74b49774 100644
--- a/tensorflow/core/common_runtime/renamed_device.cc
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -28,7 +28,7 @@ namespace tensorflow {
 
 /* static */
 std::unique_ptr<Device> RenamedDevice::NewRenamedDevice(
-    const string& new_base, Device* underlying, bool owns_underlying,
+    const std::string& new_base, Device* underlying, bool owns_underlying,
     bool isolate_session_state,
     thread::ThreadPoolInterface* underlying_threadpool) {
   DeviceNameUtils::ParsedName parsed_name;
@@ -39,9 +39,9 @@ std::unique_ptr<Device> RenamedDevice::NewRenamedDevice(
   CHECK(underlying_parsed_name.has_id);
   parsed_name.type = underlying_parsed_name.type;
   parsed_name.id = underlying_parsed_name.id;
-  string name = DeviceNameUtils::FullName(parsed_name.job, parsed_name.replica,
-                                          parsed_name.task, parsed_name.type,
-                                          parsed_name.id);
+  std::string name = DeviceNameUtils::FullName(
+      parsed_name.job, parsed_name.replica, parsed_name.task, parsed_name.type,
+      parsed_name.id);
   DeviceAttributes attributes(underlying->attributes());
   attributes.set_name(name);
   // Call absl::WrapUnique to access private constructor.
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 4a0e1057b398a4..687f61f8eff2d8 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -30,7 +30,7 @@ namespace tensorflow {
 class RenamedDevice : public Device {
  public:
   static std::unique_ptr<Device> NewRenamedDevice(
-      const string& new_base, Device* underlying, bool owns_underlying,
+      const std::string& new_base, Device* underlying, bool owns_underlying,
       bool isolate_session_state,
       thread::ThreadPoolInterface* underlying_threadpool = nullptr);
 
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index f1a199ba97250d..1d6e53c6585068 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -99,9 +99,9 @@ void SameWorkerRecvDone(const DeviceMgr* device_mgr,
   if (in.dtype() != DT_VARIANT) {
     // Variants are handled by CopyTensor::ViaDMA.
     AllocationAttributes aa;
-    uint64 safe_alloc_frontier = dst_device->SafeAllocFrontier(0);
-    std::function<uint64()> freed_by_func = [dst_device,
-                                             &safe_alloc_frontier]() {
+    uint64_t safe_alloc_frontier = dst_device->SafeAllocFrontier(0);
+    std::function<uint64_t()> freed_by_func = [dst_device,
+                                               &safe_alloc_frontier]() {
       safe_alloc_frontier = dst_device->SafeAllocFrontier(safe_alloc_frontier);
       return safe_alloc_frontier;
     };
diff --git a/tensorflow/core/common_runtime/rendezvous_util.cc b/tensorflow/core/common_runtime/rendezvous_util.cc
index 532f4e84a2f9f2..8f4e7acbf77ed5 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.cc
+++ b/tensorflow/core/common_runtime/rendezvous_util.cc
@@ -22,7 +22,8 @@ namespace tensorflow {
 absl::Status SendTensorsToRendezvous(
     RendezvousInterface* rendezvous, DeviceContext* device_context,
     const std::vector<AllocatorAttributes>& alloc_attrs,
-    const std::vector<string>& keys, absl::Span<const Tensor> tensors_to_send) {
+    const std::vector<std::string>& keys,
+    absl::Span<const Tensor> tensors_to_send) {
   if (keys.size() != tensors_to_send.size()) {
     return errors::InvalidArgument(
         "keys and tensors_to_send are not the same size. keys.size() = ",
@@ -56,7 +57,7 @@ absl::Status SendTensorsToRendezvous(
 void RecvOutputsFromRendezvousAsync(
     RendezvousInterface* rendezvous, DeviceContext* device_context,
     const std::vector<AllocatorAttributes>& alloc_attrs,
-    const std::vector<string>& keys, std::vector<Tensor>* received_tensors,
+    const std::vector<std::string>& keys, std::vector<Tensor>* received_tensors,
     StatusCallback done) {
   if (keys.empty()) {
     done(absl::OkStatus());
@@ -69,8 +70,8 @@ void RecvOutputsFromRendezvousAsync(
   }
 
   received_tensors->reserve(keys.size());
-  std::vector<
-      std::tuple<string, Tensor*, Rendezvous::ParsedKey>>
+  std::vector<std::tuple<std::string, Tensor*, Rendezvous::ParsedKey>>
       arguments;
   for (int i = 0; i < keys.size(); ++i) {
     Rendezvous::ParsedKey parsed;
@@ -90,7 +91,7 @@ void RecvOutputsFromRendezvousAsync(
   auto status_cb = new ReffedStatusCallback(std::move(done));
   for (auto& p : arguments) {
-    const string& key = std::get<0>(p);
+    const std::string& key = std::get<0>(p);
     Tensor* val = std::get<1>(p);
     Rendezvous::ParsedKey parsed = std::get<2>(p);
     Rendezvous::Args rendez_args;
@@ -124,7 +125,7 @@ absl::Status RecvOutputsFromRendezvous(RendezvousInterface* rendezvous,
   // Receives values requested by the caller.
   Rendezvous::ParsedKey parsed;
   for (auto& p : *out) {
-    const string& key = p.first;
+    const std::string& key = p.first;
     Tensor* val = &p.second;
     bool is_dead = false;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(key, &parsed));
diff --git a/tensorflow/core/common_runtime/rendezvous_util.h b/tensorflow/core/common_runtime/rendezvous_util.h
index 8ed1dd7a11ad16..1c9ac0ef221a54 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.h
+++ b/tensorflow/core/common_runtime/rendezvous_util.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-typedef std::map<string, Tensor> NamedTensors;
+typedef std::map<std::string, Tensor> NamedTensors;
 typedef std::function<void(const absl::Status&)> StatusCallback;
 
 // Uses `rendezvous` to send tensors in `tensors_to_send`. `device_context`
@@ -33,7 +33,8 @@ typedef std::function<void(const absl::Status&)> StatusCallback;
 absl::Status SendTensorsToRendezvous(
     RendezvousInterface* rendezvous, DeviceContext* device_context,
     const std::vector<AllocatorAttributes>& alloc_attrs,
-    const std::vector<string>& keys, absl::Span<const Tensor> tensors_to_send);
+    const std::vector<std::string>& keys,
+    absl::Span<const Tensor> tensors_to_send);
 
 // Uses `rendezvous` to obtain tensors. `device_context` should be the
 // DeviceContext associated with the receiving device. `alloc_attrs` contains
@@ -42,7 +43,7 @@ absl::Status SendTensorsToRendezvous(
 void RecvOutputsFromRendezvousAsync(
     RendezvousInterface* rendezvous, DeviceContext* device_context,
     const std::vector<AllocatorAttributes>& alloc_attrs,
-    const std::vector<string>& keys, std::vector<Tensor>* received_tensors,
+    const std::vector<std::string>& keys, std::vector<Tensor>* received_tensors,
     StatusCallback done);
 
 absl::Status RecvOutputsFromRendezvous(RendezvousInterface* rendezvous,
diff --git a/tensorflow/core/common_runtime/rendezvous_util_test.cc b/tensorflow/core/common_runtime/rendezvous_util_test.cc
index 484746ce416f0b..f2c866c307905c 100644
--- a/tensorflow/core/common_runtime/rendezvous_util_test.cc
+++ b/tensorflow/core/common_runtime/rendezvous_util_test.cc
@@ -32,20 +32,20 @@ class RendezvousUtilTest : public ::testing::Test {
 };
 
 // string -> Tensor
-Tensor V(const string& content) {
+Tensor V(const std::string& content) {
   Tensor tensor(DT_STRING, TensorShape({}));
   tensor.scalar<tstring>()() = content;
   return tensor;
 }
 
 // Tensor -> string
-string V(const Tensor& tensor) {
+std::string V(const Tensor& tensor) {
   CHECK_EQ(tensor.dtype(), DT_STRING);
   CHECK(TensorShapeUtils::IsScalar(tensor.shape()));
   return tensor.scalar<tstring>()();
 }
 
-string MakeStringKey(const string& name) {
+std::string MakeStringKey(const std::string& name) {
   return Rendezvous::CreateKey(
       "/job:localhost/replica:0/task:0/device:CPU:0", 0,
       "/job:localhost/replica:0/task:0/device:GPU:0", name, FrameAndIter(0, 0));
diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.cc b/tensorflow/core/common_runtime/replicate_constants_pass.cc
index 7da785ca6f54e3..9dfa50ae0dc2a4 100644
--- a/tensorflow/core/common_runtime/replicate_constants_pass.cc
+++ b/tensorflow/core/common_runtime/replicate_constants_pass.cc
@@ -70,8 +70,8 @@ bool HasCpuDevice(const Node* node) {
 // Convert the CPU device name to the corresponding CPU device name. If
 // multiple local CPU devices are enabled, the CPU device name will also
 // contain the device id.
-absl::Status DeviceNameToCpuDeviceNameWithDeviceId(const string& device_name,
-                                                   string* host_device_name) {
+absl::Status DeviceNameToCpuDeviceNameWithDeviceId(
+    const std::string& device_name, std::string* host_device_name) {
   DeviceNameUtils::ParsedName device;
   if (!DeviceNameUtils::ParseFullName(device_name, &device)) {
     return absl::InternalError(
diff --git a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc
index 3f4cf1498769a0..e60117f588f8c0 100644
--- a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc
+++ b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc
@@ -45,7 +45,7 @@ class ReplicateHelper {
 
   // Replicate the given node to an allowed device.
   absl::Status ReplicateNode(const Node* node,
-                             const std::vector<string>& allowed_devices,
+                             const std::vector<std::string>& allowed_devices,
                              int allowed_device_index, Graph* graph) {
     auto& replicated_nodes = replicated_nodes_map_.at(node);
     if (replicated_nodes[allowed_device_index] != nullptr) {
@@ -53,8 +53,8 @@ class ReplicateHelper {
     }
     const auto& device = allowed_devices.at(allowed_device_index);
     NodeDef node_def = node->def();
-    const string suffix = strings::StrCat("/R", allowed_device_index);
-    node_def.set_name(graph->NewName(strings::StrCat(node_def.name(), suffix)));
+    const std::string suffix = absl::StrCat("/R", allowed_device_index);
+    node_def.set_name(graph->NewName(absl::StrCat(node_def.name(), suffix)));
     TF_ASSIGN_OR_RETURN(Node * replicated_node, graph->AddNode(node_def));
     replicated_node->set_assigned_device_name(device);
     if (replicated_node->IsArg()) {
@@ -83,7 +83,7 @@ class ReplicateHelper {
   // Replace an edge (composite device -> composite device) with
   // N edges (allowed devices -> allowed devices).
   absl::Status ReplicateFromCompositeDeviceToCompositeDevice(
-      const Edge* edge, const std::vector<string>& allowed_devices,
+      const Edge* edge, const std::vector<std::string>& allowed_devices,
       Graph* graph) {
     const std::vector<Node*>& src_replicated_nodes =
         replicated_nodes_map_.at(edge->src());
@@ -115,12 +115,12 @@ class ReplicateHelper {
   // Control edge: replace an edge (composite device -> a regular device) with
   // N edges (allowed devices -> a regular device).
   absl::Status ReplicateFromCompositeDeviceToRegularDevice(
-      const Edge* edge, const std::vector<string>& allowed_devices,
+      const Edge* edge, const std::vector<std::string>& allowed_devices,
       Graph* graph) {
     const std::vector<Node*>& src_replicated_nodes =
         replicated_nodes_map_.at(edge->src());
     Node* dst = edge->dst();
-    const string& dst_device = dst->assigned_device_name();
+    const std::string& dst_device = dst->assigned_device_name();
     bool found_src_node = false;
     for (int i = 0; i < allowed_devices.size(); ++i) {
       if (allowed_devices.at(i) == dst_device) {
@@ -198,7 +198,7 @@ class ReplicateHelper {
 
 // Replicate the nodes in cluster_nodes and update edges.
 absl::Status ReplicateNodesAndEdges(
-    const std::vector<string>& allowed_devices,
+    const std::vector<std::string>& allowed_devices,
     absl::flat_hash_map<Node*, int>* cluster_nodes, ReplicateHelper* helper,
     Graph* graph) {
   // Contains nodes in cluster_nodes whose out nodes are all on physical
@@ -253,19 +253,19 @@ absl::Status ReplicateNodesAndEdges(
 }  // namespace
 
 absl::Status ReplicatePerReplicaNodesInFunctionGraph(
-    const absl::flat_hash_map<string, const std::vector<string>*>&
+    const absl::flat_hash_map<std::string, const std::vector<std::string>*>&
         composite_devices,
     Graph* graph) {
   VLOG(1) << "Starting ReplicatePerReplicaNodesInFunctionGraph";
   VLOG(1) << "Graph #nodes " << graph->num_nodes() << " #edges "
          << graph->num_edges();
-  std::set<string> composite_device_names;
+  std::set<std::string> composite_device_names;
   for (const auto& it : composite_devices) {
     composite_device_names.insert(it.first);
   }
   // Map from a composite device to a cluster of nodes assigned to the
   // composite device and the numbers of their out edges to process.
-  absl::flat_hash_map<string, std::map<Node*, int>>
+  absl::flat_hash_map<std::string, std::map<Node*, int>>
       composite_device_to_cluster_nodes;
   for (Node* n : graph->op_nodes()) {
     if (composite_device_names.find(n->assigned_device_name()) !=
@@ -284,7 +284,7 @@ absl::Status ReplicatePerReplicaNodesInFunctionGraph(
   }
 
   for (auto& it : composite_device_to_cluster_nodes) {
-    const std::vector<string>& allowed_devices =
+    const std::vector<std::string>& allowed_devices =
         *composite_devices.at(it.first);
     if (allowed_devices.empty()) {
       return errors::InvalidArgument("No allowed device of composite device: ",
diff --git a/tensorflow/core/common_runtime/replicate_per_replica_nodes.h b/tensorflow/core/common_runtime/replicate_per_replica_nodes.h
index 4be95ea32ca44b..414bd21de35361 100644
--- a/tensorflow/core/common_runtime/replicate_per_replica_nodes.h
+++ b/tensorflow/core/common_runtime/replicate_per_replica_nodes.h
@@ -35,7 +35,7 @@ namespace tensorflow {
 // dependency.
 // TODO(b/145922293): Register it as a POST_REWRITE_FOR_EXEC pass.
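A hypothetical invocation of the rewrite declared just below, mirroring the SingleCompositeDevice test later in this patch (device names illustrative; `graph` is an assumed existing tensorflow::Graph):

// Map a composite device to its per-replica devices, then replicate nodes
// assigned to the composite device onto each underlying device.
const std::vector<std::string> underlying_devices = {"/device:TPU:0",
                                                     "/device:TPU:1"};
const absl::flat_hash_map<std::string, const std::vector<std::string>*>
    composite_devices = {{"/device:TPU_COMPOSITE:0", &underlying_devices}};
TF_RETURN_IF_ERROR(
    ReplicatePerReplicaNodesInFunctionGraph(composite_devices, &graph));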
 absl::Status ReplicatePerReplicaNodesInFunctionGraph(
-    const absl::flat_hash_map<string, const std::vector<string>*>&
+    const absl::flat_hash_map<std::string, const std::vector<std::string>*>&
         composite_devices,
     Graph* graph);
 
diff --git a/tensorflow/core/common_runtime/replicate_per_replica_nodes_test.cc b/tensorflow/core/common_runtime/replicate_per_replica_nodes_test.cc
index ff6fcb4b8bc735..f0a859286fba06 100644
--- a/tensorflow/core/common_runtime/replicate_per_replica_nodes_test.cc
+++ b/tensorflow/core/common_runtime/replicate_per_replica_nodes_test.cc
@@ -40,7 +40,7 @@ class GraphHelper {
     }
   }
 
-  Node* GetNodeByName(const string& name) {
+  Node* GetNodeByName(const std::string& name) {
     const auto it = nodes_by_name_.find(name);
     if (it != nodes_by_name_.end()) {
       return it->second;
@@ -53,7 +53,8 @@ class GraphHelper {
     return nullptr;
   }
 
-  void SetAssignedDevice(const string& node_name, const string& device_name) {
+  void SetAssignedDevice(const std::string& node_name,
+                         const std::string& device_name) {
     CHECK_NOTNULL(GetNodeByName(node_name))
         ->set_assigned_device_name(device_name);
   }
@@ -68,14 +69,14 @@ class GraphHelper {
     EXPECT_EQ(arg_num, expected_num);
   }
 
-  void CheckAssignedDevice(const string& node_name,
-                           const string& expected_device_name) {
+  void CheckAssignedDevice(const std::string& node_name,
+                           const std::string& expected_device_name) {
     EXPECT_EQ(expected_device_name,
               CHECK_NOTNULL(GetNodeByName(node_name))->assigned_device_name());
   }
 
-  void CheckAssignedDevicePrefix(const string& node_name,
-                                 const string& expected_device_name) {
+  void CheckAssignedDevicePrefix(const std::string& node_name,
+                                 const std::string& expected_device_name) {
     auto assigned =
         CHECK_NOTNULL(GetNodeByName(node_name))->assigned_device_name();
     EXPECT_EQ(assigned.rfind(expected_device_name, 0), 0);
@@ -85,21 +86,21 @@ class GraphHelper {
   const Graph& graph_;
   // Maps from a node name to a Node* in the graph. We use an ordered map here
   // to ensure stability of GetNodeByName().
-  std::map<string, Node*> nodes_by_name_;
+  std::map<std::string, Node*> nodes_by_name_;
 };
 
 TEST(ReplicatePerReplicaNodesTest, SingleCompositeDevice) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
   Output arg = ops::_Arg(scope.WithOpName("arg"), DT_RESOURCE, 0);
   auto read = ops::ReadVariableOp(scope.WithOpName("read"), arg, DT_INT32);
-  auto one = ops::Const<int32>(scope.WithOpName("one"), 1);
+  auto one = ops::Const<int32_t>(scope.WithOpName("one"), 1);
   auto write = ops::AssignVariableOp(scope.WithOpName("write"), arg, one);
   auto ret = ops::_Retval(
       scope.WithOpName("ret").WithControlDependencies({write}), read, 0);
 
-  const std::vector<string> underlying_devices = {"/device:TPU:0",
-                                                  "/device:TPU:1"};
-  const absl::flat_hash_map<string, const std::vector<string>*>
+  const std::vector<std::string> underlying_devices = {"/device:TPU:0",
+                                                       "/device:TPU:1"};
+  const absl::flat_hash_map<std::string, const std::vector<std::string>*>
      composite_devices = {{"/device:TPU_COMPOSITE:0", &underlying_devices}};
 
   Graph graph(OpRegistry::Global());
@@ -143,8 +144,8 @@ TEST(ReplicatePerReplicaNodesTest, SingleCompositeDeviceToSingleDevice) {
   auto read = ops::ReadVariableOp(scope.WithOpName("read"), arg, DT_INT32);
   auto ret = ops::_Retval(scope.WithOpName("ret"), read, 0);
 
-  const std::vector<string> underlying_devices = {"/device:TPU:0"};
-  const absl::flat_hash_map<string, const std::vector<string>*>
+  const std::vector<std::string> underlying_devices = {"/device:TPU:0"};
+  const absl::flat_hash_map<std::string, const std::vector<std::string>*>
      composite_devices = {{"/device:TPU_COMPOSITE:0", &underlying_devices}};
 
   Graph graph(OpRegistry::Global());
@@ -183,11 +184,11 @@ TEST(ReplicatePerReplicaNodesTest, MultipleCompositeDevices) {
   auto add = ops::Add(scope.WithOpName("add"), identity0, identity1);
   auto ret = ops::_Retval(scope.WithOpName("ret"), add, 0);
 
-  const std::vector<string> underlying_devices_0 = {"/device:TPU:0",
-                                                    "/device:TPU:1"};
-  const std::vector<string> underlying_devices_1 = {"/device:TPU:2",
-                                                    "/device:TPU:3"};
-  const absl::flat_hash_map<string, const std::vector<string>*>
+  const std::vector<std::string> underlying_devices_0 = {"/device:TPU:0",
+                                                         "/device:TPU:1"};
+  const std::vector<std::string> underlying_devices_1 = {"/device:TPU:2",
+                                                         "/device:TPU:3"};
+  const absl::flat_hash_map<std::string, const std::vector<std::string>*>
      composite_devices = {{"/device:TPU_COMPOSITE:0", &underlying_devices_0},
                           {"/device:TPU_COMPOSITE:1", &underlying_devices_1}};
 
@@ -232,9 +233,9 @@ TEST(ReplicatePerReplicaNodesTest, MultipleCompositeDevices) {
 }
 
 TEST(ReplicatePerReplicaNodesTest, NestedFunctions) {
-  const std::vector<string> underlying_devices = {"/device:TPU:0",
-                                                  "/device:TPU:1"};
-  const absl::flat_hash_map<string, const std::vector<string>*>
+  const std::vector<std::string> underlying_devices = {"/device:TPU:0",
+                                                       "/device:TPU:1"};
+  const absl::flat_hash_map<std::string, const std::vector<std::string>*>
      composite_devices = {{"/device:TPU_COMPOSITE:0", &underlying_devices}};
 
   FunctionDefLibrary fdef_lib;
@@ -311,9 +312,9 @@ TEST(ReplicatePerReplicaNodesTest, DeadArgNodes) {
   auto read = ops::ReadVariableOp(scope.WithOpName("read"), arg, DT_INT32);
   auto ret = ops::_Retval(scope.WithOpName("ret"), read, 0);
 
-  const std::vector<string> underlying_devices = {"/device:TPU:0",
-                                                  "/device:TPU:1"};
-  const absl::flat_hash_map<string, const std::vector<string>*>
+  const std::vector<std::string> underlying_devices = {"/device:TPU:0",
+                                                       "/device:TPU:1"};
+  const absl::flat_hash_map<std::string, const std::vector<std::string>*>
      composite_devices = {{"/device:TPU_COMPOSITE:0", &underlying_devices}};
 
   Graph graph(OpRegistry::Global());
diff --git a/tensorflow/core/common_runtime/ring_alg.cc b/tensorflow/core/common_runtime/ring_alg.cc
index a12acfdf64c9dd..ff44370ecbd451 100644
--- a/tensorflow/core/common_runtime/ring_alg.cc
+++ b/tensorflow/core/common_runtime/ring_alg.cc
@@ -61,8 +61,8 @@ namespace {
 // RingAlg instances.  Note that the exec_key will differentiate between
 // different instances consequently we don't need to further differentiate
 // between subclasses of RingAlg.
-string RingAlgBufKey(const string& name, const string& exec_key, int pass,
-                     int section, int source_rank) {
+std::string RingAlgBufKey(const std::string& name, const std::string& exec_key,
+                          int pass, int section, int source_rank) {
   if (READABLE_KEYS) {
     return strings::StrCat(name, "(", exec_key, "):pass(", pass, "):section(",
                            section, "):srcrank(", source_rank, ")");
@@ -97,7 +97,7 @@ RingAlg::RingField* RingAlg::PCQueue::Dequeue() {
   return rf;
 }
 
-RingAlg::RingAlg(CollectiveType type, const string& name)
+RingAlg::RingAlg(CollectiveType type, const std::string& name)
     : type_(type),
       name_(name),
       col_ctx_(nullptr),
@@ -163,10 +163,10 @@ absl::Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
   }
 
   if (VLOG_IS_ON(2)) {
-    string subdiv_buf;
+    std::string subdiv_buf;
     for (const int subdiv_offset :
          col_params->instance.impl_details.subdiv_offsets) {
-      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
+      absl::StrAppend(&subdiv_buf, " ", subdiv_offset);
     }
     VLOG(2) << "Dynamically generated " << num_subdivs
             << " subdiv_offsets:" << subdiv_buf << " tensor_size "
@@ -178,7 +178,7 @@ absl::Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
 }  // namespace
 
 absl::Status RingAlg::InitializeCollectiveParams(CollectiveParams* col_params) {
-  const string& device_name =
+  const std::string& device_name =
       col_params->group.members[col_params->default_rank].device.name();
   // Each subdiv permutation is a ring formed by rotating each
   // single-task subsequence of devices by an offset.  This makes most
@@ -190,7 +190,7 @@ absl::Status RingAlg::InitializeCollectiveParams(CollectiveParams* col_params) {
   // Precondition: device_names must be sorted so that all devices in
   // the same task are adjacent.
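As a worked example of the READABLE_KEYS format produced by RingAlgBufKey above (all argument values hypothetical):

// RingAlgBufKey("Gather", "exec_key_1", /*pass=*/0, /*section=*/2,
//               /*source_rank=*/3)
// -> "Gather(exec_key_1):pass(0):section(2):srcrank(3)"

Distinct (pass, section, source_rank) tuples therefore yield distinct buffer keys within one exec_key, which is what keeps concurrent sends and receives of different subchunks from colliding.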
   std::vector<int> dev_per_task;
-  const string* prior_task_name = &col_params->group.members[0].task;
+  const std::string* prior_task_name = &col_params->group.members[0].task;
   int dev_count = 1;
   for (int di = 1; di < col_params->group.group_size; ++di) {
     if (col_params->group.members[di].task != *prior_task_name) {
@@ -265,7 +265,7 @@ absl::Status RingAlg::InitializeCollectiveContext(
                                         &col_ctx->device_locality);
 }
 
-string RingAlg::TensorDebugString(const Tensor& tensor) {
+std::string RingAlg::TensorDebugString(const Tensor& tensor) {
   const DeviceBase::AcceleratorDeviceInfo* accelerator_device_info =
       col_ctx_->op_ctx->device()->tensorflow_accelerator_device_info();
   if (accelerator_device_info) {
@@ -383,11 +383,11 @@ void RingAlg::AdvanceToSecondPass(RingField* rf) {
   VLOG(3) << "IncrRingField new value " << rf->DebugString();
 }
 
-string RingAlg::RingField::DebugString() const {
-  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
-                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
-                              " action=", action);
-  strings::StrAppend(&rv, " pass=", second_pass);
+std::string RingAlg::RingField::DebugString() const {
+  std::string rv = strings::StrCat(
+      "RingField rank=", rank, " chunk_idx=", chunk_idx, " subdiv=", subdiv_idx,
+      " sc_idx=", sc_idx, " action=", action);
+  absl::StrAppend(&rv, " pass=", second_pass);
   strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
                      " is_final=", is_final, " recv_is_remote=", recv_is_remote,
                      " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
@@ -396,8 +396,8 @@ string RingAlg::RingField::DebugString() const {
 
 void RingAlg::DispatchSend(RingField* rf, const StatusCallback& done) {
   DCHECK(rf->do_send);
-  string send_buf_key = RingAlgBufKey(name_, col_ctx_->exec_key,
-                                      rf->second_pass, rf->sc_idx, rf->rank);
+  std::string send_buf_key = RingAlgBufKey(
+      name_, col_ctx_->exec_key, rf->second_pass, rf->sc_idx, rf->rank);
   VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
           << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
          << rf->sc_idx;
@@ -415,7 +415,7 @@ void RingAlg::DispatchSend(RingField* rf, const StatusCallback& done) {
 
 void RingAlg::DispatchRecv(RingField* rf, const StatusCallback& done) {
   DCHECK(rf->do_recv);
-  string recv_buf_key =
+  std::string recv_buf_key =
       RingAlgBufKey(name_, col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
                     (rf->rank + (group_size_ - 1)) % group_size_);
   VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
@@ -434,9 +434,9 @@ void RingAlg::DispatchRecv(RingField* rf, const StatusCallback& done) {
       col_ctx_->op_ctx->cancellation_manager(), done);
 }
 
-string RingAlg::FieldState() {
-  string s = strings::StrCat(
-      "Ring", name_, " ", strings::Hex(reinterpret_cast<uint64>(this)),
+std::string RingAlg::FieldState() {
+  std::string s = strings::StrCat(
+      "Ring", name_, " ", strings::Hex(reinterpret_cast<uint64_t>(this)),
       " exec ", col_ctx_->exec_key, " step_id=", col_ctx_->step_id,
       " state of all ", rfv_.size(), " fields:");
   for (int i = 0; i < rfv_.size(); ++i) {
diff --git a/tensorflow/core/common_runtime/ring_alg.h b/tensorflow/core/common_runtime/ring_alg.h
index d2294f830db2c1..b54da03a01a739 100644
--- a/tensorflow/core/common_runtime/ring_alg.h
+++ b/tensorflow/core/common_runtime/ring_alg.h
@@ -31,7 +31,7 @@ class Device;
 // for specific collective functions.
 class RingAlg : public CollectiveImplementationInterface {
  public:
-  explicit RingAlg(CollectiveType type, const string& name);
+  explicit RingAlg(CollectiveType type, const std::string& name);
   ~RingAlg() override {}
 
   // Establishes the requested number of subdivision permutations based on the
@@ -63,11 +63,11 @@ class RingAlg : public CollectiveImplementationInterface {
 
   // Tracks progress of actions on a single subfield of the entire tensor.
   struct RingField {
-    int16 chunk_idx;     // major division index
-    int16 subdiv_idx;    // minor division index
-    int16 sc_idx;        // subchunk index
-    int16 rank;          // rank within subdiv permutation
-    int16 recv_dev_idx;  // dev from which value should be recv'd
+    int16_t chunk_idx;     // major division index
+    int16_t subdiv_idx;    // minor division index
+    int16_t sc_idx;        // subchunk index
+    int16_t rank;          // rank within subdiv permutation
+    int16_t recv_dev_idx;  // dev from which value should be recv'd
     RingFieldAction action;
     bool second_pass;
     bool recv_is_remote = false;
@@ -78,7 +78,7 @@ class RingAlg : public CollectiveImplementationInterface {
     Tensor chunk;      // alias to field values
     Tensor tmp_chunk;
     absl::Status status;
-    string DebugString() const;
+    std::string DebugString() const;
   };
   virtual void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
                              int field_idx);
@@ -87,8 +87,8 @@ class RingAlg : public CollectiveImplementationInterface {
   void DispatchRecv(RingField* rf, const StatusCallback& done);
 
   // For constructing log messages for debugging.
-  string FieldState();
-  string TensorDebugString(const Tensor& tensor);
+  std::string FieldState();
+  std::string TensorDebugString(const Tensor& tensor);
 
   // Producer/Consumer Queue of RingField structs.
   class PCQueue {
@@ -104,7 +104,7 @@
   };
 
   const CollectiveType type_;
-  const string name_;
+  const std::string name_;
   std::shared_ptr<CollectiveContext> col_ctx_;
   const CollectiveParams* col_params_;  // Not owned
   StatusCallback done_;
diff --git a/tensorflow/core/common_runtime/ring_gatherer.cc b/tensorflow/core/common_runtime/ring_gatherer.cc
index bc016b366696d4..bd85f07aef1840 100644
--- a/tensorflow/core/common_runtime/ring_gatherer.cc
+++ b/tensorflow/core/common_runtime/ring_gatherer.cc
@@ -71,7 +71,7 @@ void RingGatherer::Run(StatusCallback done) {
   DCHECK_GT(num_subdivs_, 0);
 
   if (VLOG_IS_ON(1)) {
-    string buf;
+    std::string buf;
     for (int r = 0; r < col_params_->group.members.size(); ++r) {
       strings::StrAppend(&buf, "dev ", r, " : ",
                          col_params_->group.members[r].device.name(), "\n");
@@ -79,10 +79,10 @@ void RingGatherer::Run(StatusCallback done) {
     for (int sd = 0;
          sd < col_params_->instance.impl_details.subdiv_permutations.size();
         ++sd) {
-      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      absl::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
       for (auto x :
            col_params_->instance.impl_details.subdiv_permutations[sd]) {
-        strings::StrAppend(&buf, x, ", ");
+        absl::StrAppend(&buf, x, ", ");
       }
     }
     VLOG(1) << "RingGatherer::Run for device " << col_ctx_->device_name
diff --git a/tensorflow/core/common_runtime/ring_gatherer_test.cc b/tensorflow/core/common_runtime/ring_gatherer_test.cc
index 595ff502737b93..884fb17340c4c0 100644
--- a/tensorflow/core/common_runtime/ring_gatherer_test.cc
+++ b/tensorflow/core/common_runtime/ring_gatherer_test.cc
@@ -105,7 +105,7 @@ class RingGathererTest : public ::testing::Test {
       // Confirm that every device terminated with the expected error status.
       for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
         EXPECT_NE(instances_[di]->status_.message().find("Deliberate failure"),
-                  string::npos);
+                  std::string::npos);
       }
     } else {
       // Confirm that every device accumulated the same set of correct
@@ -130,7 +130,7 @@ class RingGathererTest : public ::testing::Test {
       GenerateEvenSubdivOffsets(test_env->num_devices_per_worker, num_subdivs);
     }
 
-    string dev_name = col_params_->group.members[rank].device.name();
+    std::string dev_name = col_params_->group.members[rank].device.name();
     TF_CHECK_OK(test_env_->device_mgr->LookupDevice(dev_name, &device_))
         << "Couldn't find device " << dev_name
        << " existing devices: " << test_env_->device_mgr->DebugString();
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index c448f021f055f0..3ad099caee9b9b 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -67,7 +67,7 @@ void RingReducer::Run(StatusCallback done) {
   CHECK_GT(num_subdivs_, 0);
 
   if (VLOG_IS_ON(1)) {
-    string buf;
+    std::string buf;
     for (int r = 0; r < col_params_->group.members.size(); ++r) {
       strings::StrAppend(&buf, "dev ", r, " : ",
                          col_params_->group.members[r].device.name(), "\n");
@@ -75,10 +75,10 @@ void RingReducer::Run(StatusCallback done) {
     for (int sd = 0;
          sd < col_params_->instance.impl_details.subdiv_permutations.size();
         ++sd) {
-      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      absl::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
       for (auto x :
            col_params_->instance.impl_details.subdiv_permutations[sd]) {
-        strings::StrAppend(&buf, x, ", ");
+        absl::StrAppend(&buf, x, ", ");
       }
     }
     VLOG(1) << "RingReducer::Run for device " << col_ctx_->device_name
@@ -129,9 +129,9 @@ void RingReducer::ContinueAfterInputCopy() {
     // can be provided to the kernel in host memory?
     Tensor group_size_val = ca_->Scalar(group_size_);
     if (col_params_->group.device_type != "CPU") {
-      uint64 safe_alloc_frontier = col_ctx_->device->SafeAllocFrontier(0);
+      uint64_t safe_alloc_frontier = col_ctx_->device->SafeAllocFrontier(0);
       AllocationAttributes aa;
-      std::function<uint64()> freed_by_func = [this, &safe_alloc_frontier]() {
+      std::function<uint64_t()> freed_by_func = [this, &safe_alloc_frontier]() {
         safe_alloc_frontier =
             col_ctx_->device->SafeAllocFrontier(safe_alloc_frontier);
         return safe_alloc_frontier;
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index d4baa4aaef652e..bedfa64134de51 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -138,7 +138,7 @@ class RingReducerTest : public ::testing::Test {
       // Confirm that every device terminated with the expected error status.
       for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
         EXPECT_NE(instances_[di]->status_.message().find("Deliberate failure"),
-                  string::npos);
+                  std::string::npos);
       }
     } else {
       // Confirm that every device computed the same correct reduction value.
@@ -165,7 +165,7 @@ class RingReducerTest : public ::testing::Test {
       GenerateEvenSubdivOffsets(test_env->num_devices_per_worker, num_subdivs);
     }
 
-    string dev_name = col_params_->group.members[rank].device.name();
+    std::string dev_name = col_params_->group.members[rank].device.name();
     TF_CHECK_OK(test_env_->device_mgr->LookupDevice(dev_name, &device_))
         << "Couldn't find device " << dev_name
        << " existing devices: " << test_env_->device_mgr->DebugString();
@@ -200,7 +200,7 @@ class RingReducerTest : public ::testing::Test {
   std::unique_ptr<CollectiveTestEnv> test_env_;
   std::vector<std::unique_ptr<DeviceInstance>> instances_;
   mutex mu_;
-  int32 reduce_counter_ TF_GUARDED_BY(mu_) = 0;
+  int32_t reduce_counter_ TF_GUARDED_BY(mu_) = 0;
 };
 
 class RingReducerInitParamsTest : public ::testing::Test {
diff --git a/tensorflow/core/common_runtime/scoped_allocator.cc b/tensorflow/core/common_runtime/scoped_allocator.cc
index 1b3d39a8c6e996..24e7e089784e17 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 
 ScopedAllocator::ScopedAllocator(const Tensor& backing_tensor, int32_t scope_id,
-                                 const string& name,
+                                 const std::string& name,
                                  const absl::Span<const Field> fields,
                                  int32_t expected_call_count,
                                  ScopedAllocatorContainer* container)
@@ -69,7 +69,7 @@ void* ScopedAllocator::AllocateRaw(int32_t field_index, size_t num_bytes) {
     return nullptr;
   }
 
-  int32_t num_fields = static_cast<int32>(fields_.size());
+  int32_t num_fields = static_cast<int32_t>(fields_.size());
   if (field_index >= num_fields) {
     LOG(ERROR) << "ScopedAllocator " << name_
                << " received unexpected field number " << field_index;
@@ -228,8 +228,8 @@ void ScopedAllocatorInstance::DeallocateRaw(void* p) {
   if (del) delete this;
 }
 
-string ScopedAllocatorInstance::Name() {
-  return strings::StrCat(scoped_allocator_->name(), "_field_", field_index_);
+std::string ScopedAllocatorInstance::Name() {
+  return absl::StrCat(scoped_allocator_->name(), "_field_", field_index_);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/scoped_allocator.h b/tensorflow/core/common_runtime/scoped_allocator.h
index 5b22deb264ce52..8c894372fbee15 100644
--- a/tensorflow/core/common_runtime/scoped_allocator.h
+++ b/tensorflow/core/common_runtime/scoped_allocator.h
@@ -33,7 +33,7 @@ class ScopedAllocator {
   // A subrange of the TensorBuffer associated with this object that
   // will be the backing memory for one aliased tensor.
   struct Field {
-    int32 scope_id;
+    int32_t scope_id;
     size_t offset;
     size_t bytes_requested;
     size_t bytes_allocated;
@@ -71,13 +71,13 @@ class ScopedAllocator {
   void DeallocateRaw(void* p) TF_LOCKS_EXCLUDED(mu_);
   Tensor backing_tensor_;
   TensorBuffer* tbuf_;
-  int32 id_;
+  int32_t id_;
   std::string name_;
   ScopedAllocatorContainer* container_;
   std::vector<Field> fields_;
   mutex mu_;
-  int32 expected_call_count_ TF_GUARDED_BY(mu_);
-  int32 live_alloc_count_ TF_GUARDED_BY(mu_);
+  int32_t expected_call_count_ TF_GUARDED_BY(mu_);
+  int32_t live_alloc_count_ TF_GUARDED_BY(mu_);
 };
 
 // An Allocator that will return a pointer into the backing buffer of
@@ -117,7 +117,7 @@ class ScopedAllocatorInstance : public Allocator {
  private:
   mutex mu_;
   ScopedAllocator* scoped_allocator_;
-  int32 field_index_;
+  int32_t field_index_;
   bool allocated_ TF_GUARDED_BY(mu_);
   bool deallocated_ TF_GUARDED_BY(mu_);
   bool in_table_ TF_GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index 47ddfabbc27efe..d4fe07b5f27d2b 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -20,7 +20,8 @@ limitations under the License.
 namespace tensorflow {
 
 absl::Status ScopedAllocatorContainer::AddScopedAllocator(
-    const Tensor& backing_tensor, int32_t scope_id, const string& scope_name,
+    const Tensor& backing_tensor, int32_t scope_id,
+    const std::string& scope_name,
     const absl::Span<const ScopedAllocator::Field>& fields,
     int32_t expected_call_count) {
   VLOG(1) << "AddScopedAllocator " << mgr_->device_name()
@@ -152,7 +153,7 @@ ScopedAllocatorContainer* ScopedAllocatorMgr::GetContainer(int64_t step_id) {
 
 absl::Status ScopedAllocatorMgr::AddScopedAllocator(
     const Tensor& backing_tensor, int64_t step_id, int32_t scope_id,
-    const string& scope_name,
+    const std::string& scope_name,
     const absl::Span<const ScopedAllocator::Field>& fields,
     int32_t expected_call_count) {
   ScopedAllocatorContainer* sac = GetContainer(step_id);
@@ -164,7 +165,7 @@ absl::Status ScopedAllocatorMgr::AddScopedAllocator(
 size_t ScopedAllocatorMgr::PopulateFields(
     int32_t scope_id, const absl::Span<const TensorShape>& shapes,
     const DataType dtype, std::vector<ScopedAllocator::Field>* fields) {
-  const int32_t num_fields = static_cast<int32>(shapes.size());
+  const int32_t num_fields = static_cast<int32_t>(shapes.size());
   fields->resize(num_fields);
   // At the end of iteration `i`, `offset` points to the offset from the start
   // of the backing buffer until the end of `field[i].bytes_allocated`. This
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.h b/tensorflow/core/common_runtime/scoped_allocator_mgr.h
index dbbf7c3249ae54..22924a7005e892 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.h
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.h
@@ -54,7 +54,7 @@ class ScopedAllocatorContainer : public core::RefCounted {
   int64_t step_id_;
   mutex mu_;
   struct SAField {
-    int32 field_index;
+    int32_t field_index;
     union {
       ScopedAllocator* scoped_allocator;
       ScopedAllocatorInstance* instance;
@@ -67,7 +67,7 @@ class ScopedAllocatorContainer : public core::RefCounted {
         : field_index(ScopedAllocator::kBackingIndex),
          scoped_allocator(nullptr) {}
   };
-  std::unordered_map<int32, SAField> allocators_ TF_GUARDED_BY(mu_);
+  std::unordered_map<int32_t, SAField> allocators_ TF_GUARDED_BY(mu_);
 };
 
 // At most one of these exists per device.
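A minimal sketch of the offset bookkeeping PopulateFields performs, under the assumption that each field's allocation is rounded up so the next field starts at an aligned offset (kAlignment, AlignUp, and LayOutFields are hypothetical stand-ins, not the TF implementation):

// Lay fields out back-to-back in the backing buffer; returns total bytes.
constexpr size_t kAlignment = 64;  // assumed allocator alignment, power of two
size_t AlignUp(size_t n) { return (n + kAlignment - 1) & ~(kAlignment - 1); }

size_t LayOutFields(std::vector<ScopedAllocator::Field>* fields) {
  size_t offset = 0;
  for (auto& f : *fields) {
    f.offset = offset;                               // where field[i] starts
    f.bytes_allocated = AlignUp(f.bytes_requested);  // requested plus padding
    offset += f.bytes_allocated;                     // end of field[i]
  }
  return offset;  // size needed for the backing tensor
}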
diff --git a/tensorflow/core/common_runtime/session.cc b/tensorflow/core/common_runtime/session.cc
index ab0d769ceebe8b..59deffca41c19c 100644
--- a/tensorflow/core/common_runtime/session.cc
+++ b/tensorflow/core/common_runtime/session.cc
@@ -36,27 +36,29 @@ Session::Session() {}
 
 Session::~Session() {}
 
-absl::Status Session::Run(const RunOptions& run_options,
-                          const std::vector<std::pair<string, Tensor> >& inputs,
-                          const std::vector<string>& output_tensor_names,
-                          const std::vector<string>& target_tensor_names,
-                          std::vector<Tensor>* outputs,
-                          RunMetadata* run_metadata) {
+absl::Status Session::Run(
+    const RunOptions& run_options,
+    const std::vector<std::pair<std::string, Tensor> >& inputs,
+    const std::vector<std::string>& output_tensor_names,
+    const std::vector<std::string>& target_tensor_names,
+    std::vector<Tensor>* outputs, RunMetadata* run_metadata) {
   return errors::Unimplemented(
       "Run with options is not supported for this session.");
 }
 
-absl::Status Session::PRunSetup(const std::vector<string>& input_names,
-                                const std::vector<string>& output_names,
-                                const std::vector<string>& target_nodes,
-                                string* handle) {
+absl::Status Session::PRunSetup(const std::vector<std::string>& input_names,
+                                const std::vector<std::string>& output_names,
+                                const std::vector<std::string>& target_nodes,
+                                std::string* handle) {
   return errors::Unimplemented(
       "Partial run is not supported for this session.");
 }
 
 absl::Status Session::PRun(
-    const string& handle, const std::vector<std::pair<string, Tensor> >& inputs,
-    const std::vector<string>& output_names, std::vector<Tensor>* outputs) {
+    const std::string& handle,
+    const std::vector<std::pair<std::string, Tensor> >& inputs,
+    const std::vector<std::string>& output_names,
+    std::vector<Tensor>* outputs) {
   return errors::Unimplemented(
       "Partial run is not supported for this session.");
 }
@@ -96,7 +98,7 @@ absl::Status NewSession(const SessionOptions& options, Session** out_session) {
 }
 
 absl::Status Reset(const SessionOptions& options,
-                   const std::vector<string>& containers) {
+                   const std::vector<std::string>& containers) {
   SessionFactory* factory;
   TF_RETURN_IF_ERROR(SessionFactory::GetFactory(options, &factory));
   return factory->Reset(options, containers);
diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc
index c21f1dc9483ee2..fc28ab4e05e887 100644
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@@ -33,7 +33,7 @@ static mutex* get_session_factory_lock() {
   return &session_factory_lock;
 }
 
-typedef std::unordered_map<string, SessionFactory*> SessionFactories;
+typedef std::unordered_map<std::string, SessionFactory*> SessionFactories;
 SessionFactories* session_factories() {
   static SessionFactories* factories = new SessionFactories;
   return factories;
@@ -41,7 +41,7 @@ SessionFactories* session_factories() {
 
 }  // namespace
 
-void SessionFactory::Register(const string& runtime_type,
+void SessionFactory::Register(const std::string& runtime_type,
                               SessionFactory* factory) {
   mutex_lock l(*get_session_factory_lock());
   if (!session_factories()->insert({runtime_type, factory}).second) {
@@ -51,17 +51,17 @@ void SessionFactory::Register(const string& runtime_type,
 }
 
 namespace {
-const string RegisteredFactoriesErrorMessageLocked() {
-  std::vector<string> factory_types;
+const std::string RegisteredFactoriesErrorMessageLocked() {
+  std::vector<std::string> factory_types;
   for (const auto& session_factory : *session_factories()) {
     factory_types.push_back(session_factory.first);
   }
-  return strings::StrCat("Registered factories are {",
-                         absl::StrJoin(factory_types, ", "), "}.");
+  return absl::StrCat("Registered factories are {",
+                      absl::StrJoin(factory_types, ", "), "}.");
 }
 
-string SessionOptionsToString(const SessionOptions& options) {
-  return strings::StrCat("target: \"", options.target,
-                         "\" config: ", options.config.ShortDebugString());
config: ", options.config.ShortDebugString()); +std::string SessionOptionsToString(const SessionOptions& options) { + return absl::StrCat("target: \"", options.target, + "\" config: ", options.config.ShortDebugString()); } } // namespace @@ -69,7 +69,7 @@ absl::Status SessionFactory::GetFactory(const SessionOptions& options, SessionFactory** out_factory) { mutex_lock l(*get_session_factory_lock()); // could use reader lock - std::vector> candidate_factories; + std::vector> candidate_factories; for (const auto& session_factory : *session_factories()) { if (session_factory.second->AcceptsOptions(options)) { VLOG(2) << "SessionFactory type " << session_factory.first @@ -93,7 +93,7 @@ absl::Status SessionFactory::GetFactory(const SessionOptions& options, // the number of sessions grows. // TODO(mrry): Consider providing a system-default fallback option // in this case. - std::vector factory_types; + std::vector factory_types; factory_types.reserve(candidate_factories.size()); for (const auto& candidate_factory : candidate_factories) { factory_types.push_back(candidate_factory.first); diff --git a/tensorflow/core/common_runtime/session_factory.h b/tensorflow/core/common_runtime/session_factory.h index ffadb29ae21a6c..3c9d08db121c68 100644 --- a/tensorflow/core/common_runtime/session_factory.h +++ b/tensorflow/core/common_runtime/session_factory.h @@ -61,12 +61,13 @@ class SessionFactory { // // Sessions that support resource containers should override this function. virtual absl::Status Reset(const SessionOptions& options, - const std::vector& containers) { + const std::vector& containers) { return errors::Unimplemented("Reset()"); } virtual ~SessionFactory() {} - static void Register(const string& runtime_type, SessionFactory* factory); + static void Register(const std::string& runtime_type, + SessionFactory* factory); static absl::Status GetFactory(const SessionOptions& options, SessionFactory** out_factory); }; diff --git a/tensorflow/core/common_runtime/session_state.cc b/tensorflow/core/common_runtime/session_state.cc index 47341276fef563..5a236367357099 100644 --- a/tensorflow/core/common_runtime/session_state.cc +++ b/tensorflow/core/common_runtime/session_state.cc @@ -23,7 +23,8 @@ namespace tensorflow { // kTensorHandleResourceTypeName. 
const char* SessionState::kTensorHandleResourceTypeName = "TensorHandle"; -absl::Status SessionState::GetTensor(const string& handle, Tensor* tensor) { +absl::Status SessionState::GetTensor(const std::string& handle, + Tensor* tensor) { mutex_lock l(state_lock_); auto it = tensors_.find(handle); if (it == tensors_.end()) { @@ -34,7 +35,7 @@ absl::Status SessionState::GetTensor(const string& handle, Tensor* tensor) { return absl::OkStatus(); } -absl::Status SessionState::AddTensor(const string& handle, +absl::Status SessionState::AddTensor(const std::string& handle, const Tensor& tensor) { mutex_lock l(state_lock_); if (!tensors_.insert({handle, tensor}).second) { @@ -44,7 +45,7 @@ absl::Status SessionState::AddTensor(const string& handle, return absl::OkStatus(); } -absl::Status SessionState::DeleteTensor(const string& handle) { +absl::Status SessionState::DeleteTensor(const std::string& handle) { mutex_lock l(state_lock_); if (tensors_.erase(handle) == 0) { return errors::InvalidArgument("Failed to delete a tensor with handle '", @@ -58,7 +59,7 @@ int64_t SessionState::GetNewId() { return tensor_id_++; } -absl::Status TensorStore::AddTensor(const string& name, +absl::Status TensorStore::AddTensor(const std::string& name, const TensorAndKey& tk) { mutex_lock l(lock_); if (!tensors_.insert({name, tk}).second) { @@ -69,18 +70,18 @@ absl::Status TensorStore::AddTensor(const string& name, return absl::OkStatus(); } -absl::Status TensorStore::SaveTensors(const std::vector<string>& output_names, - SessionState* session_state) { +absl::Status TensorStore::SaveTensors( + const std::vector<std::string>& output_names, SessionState* session_state) { mutex_lock l(lock_); if (!tensors_.empty()) { // Save only the tensors in output_names in the session. - for (const string& name : output_names) { + for (const std::string& name : output_names) { TensorId id(ParseTensorName(name)); - const string op_name(id.first); + const std::string op_name(id.first); auto it = tensors_.find(op_name); if (it != tensors_.end()) { // Save the tensor to the session state. - string key = it->second.GetHandle(op_name); + std::string key = it->second.GetHandle(op_name); TF_RETURN_IF_ERROR(session_state->AddTensor(key, it->second.tensor)); } } diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc index 0893140693fdf9..bc4787864315b0 100644 --- a/tensorflow/core/common_runtime/shape_refiner.cc +++ b/tensorflow/core/common_runtime/shape_refiner.cc @@ -162,7 +162,7 @@ absl::Status ShapeRefiner::InferShapesForFunction( const FunctionDef* function_def, AttrSlice attributes, InferenceContext* outer_context) { const Graph* graph; - const string& fname = function_def->signature().name(); + const std::string& fname = function_def->signature().name(); auto it = functions_.find(fname); if (it != functions_.end()) { graph = it->second.get(); @@ -170,7 +170,7 @@ absl::Status ShapeRefiner::InferShapesForFunction( InstantiationResult result; TF_RETURN_IF_ERROR(InstantiateFunction( *function_def, attributes, - [this](const string& op, const OpDef** sig) { + [this](const std::string& op, const OpDef** sig) { return this->function_library_->LookUpOpDef(op, sig); }, &result)); @@ -476,7 +476,7 @@ absl::Status ShapeRefiner::EvaluateConstantIntScalarEdge( scalar.NumElements()); } if (scalar.dtype() == DT_INT32) { - *result = scalar.scalar<int32>()(); + *result = scalar.scalar<int32_t>()(); } else { if (scalar.dtype() != DT_INT64) { return errors::InvalidArgument( @@ -515,7 +515,7 @@ absl::Status ShapeRefiner::ConstantPartialShape( "of '-1' is required to represent an unknown shape."); } if (t.dims() == 0) { - if (t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) { + if (t.dtype() == DT_INT32 && t.scalar<int32_t>()() == -1) { *result = target_context->UnknownShape(); return absl::OkStatus(); } else if (t.dtype() == DT_INT64 && t.scalar<int64_t>()() == -1) { @@ -531,7 +531,7 @@ absl::Status ShapeRefiner::ConstantPartialShape( TF_RETURN_IF_ERROR(src_context->WithRank(src_shape, 1, &src_shape)); - const string& src_op = input_edge->src()->type_string(); + const std::string& src_op = input_edge->src()->type_string(); if (src_context->Value(src_context->Dim(src_shape, 0)) == 0) { // Source tensor is a vector of length 0, so the shape it // represents is as scalar. diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h index 111303a095f5ae..f67e5dd4b388e7 100644 --- a/tensorflow/core/common_runtime/shape_refiner.h +++ b/tensorflow/core/common_runtime/shape_refiner.h @@ -87,7 +87,7 @@ class ShapeRefiner { } // Getters and setters for graph_def_version_.
- int32 graph_def_version() const { return graph_def_version_; } + int32_t graph_def_version() const { return graph_def_version_; } void set_graph_def_version(int32_t version) { graph_def_version_ = version; } void set_require_shape_inference_fns(bool require_shape_inference_fns) { @@ -250,7 +250,7 @@ class ShapeRefiner { shape_inference::InferenceContext* context, shape_inference::InferenceContext* outer_context = nullptr); - int32 graph_def_version_; + int32_t graph_def_version_; const OpRegistryInterface* const ops_registry_; // The lifetime of the tensors are bound to the runner, so it should be the diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc index c54f26e7cc460c..580a987b3ccffd 100644 --- a/tensorflow/core/common_runtime/shape_refiner_test.cc +++ b/tensorflow/core/common_runtime/shape_refiner_test.cc @@ -281,9 +281,10 @@ TEST_F(ShapeRefinerTest, ExtractConstantSubgraphMultiOutput) { // input_tensor from the shape function. { Scope root = Scope::NewRootScope(); - auto small = ops::Const(root, {static_cast<int32>(1), TensorShape({1, 1})}); + auto small = + ops::Const(root, {static_cast<int32_t>(1), TensorShape({1, 1})}); auto large = ops::Const( - root, {static_cast<int32>(2), TensorShape({4, kMaxTensorSize / 2})}); + root, {static_cast<int32_t>(2), TensorShape({4, kMaxTensorSize / 2})}); Node* multi; TF_ASSERT_OK(NodeBuilder("MI", "MultiIdentity") .Input(std::vector<NodeBuilder::NodeOut>{small.node(), @@ -313,7 +314,7 @@ TEST_F(ShapeRefinerTest, ExtractConstantSubgraphMultiOutput) { // The add adds 1 and 2 together, and its output has kMaxTensorSize*2 // elements. shape_inference::InferenceContext* ctx = m.GetContext(shape_v2); - EXPECT_EQ(strings::StrCat("[", kMaxTensorSize * 2 * 3, "]"), + EXPECT_EQ(absl::StrCat("[", kMaxTensorSize * 2 * 3, "]"), ctx->DebugString(ctx->output(0))); } } @@ -380,7 +381,7 @@ REGISTER_OP("ShapeData") std::vector<DimensionHandle> dims; dims.reserve(shape_data->NumElements()); for (int i = 0; i < shape_data->NumElements(); ++i) { - dims.emplace_back(c->MakeDim(shape_data->flat<int32>()(i))); + dims.emplace_back(c->MakeDim(shape_data->flat<int32_t>()(i))); } c->set_output(0, c->MakeShape(dims)); @@ -418,7 +419,7 @@ REGISTER_OP("ShapeVectorForAllElements") } int64_t total = 0; for (int i = 0; i < shape_data->NumElements(); ++i) { - total += shape_data->flat<int32>()(i); + total += shape_data->flat<int32_t>()(i); } c->set_output(0, c->Vector(total)); @@ -487,7 +488,8 @@ TEST_F(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt64) { // Create variable 2x4 tensor. auto input = ops::Variable( - root, {2, 4, static_cast<int64_t>(std::numeric_limits<int32>::max()) * 2}, + root, + {2, 4, static_cast<int64_t>(std::numeric_limits<int32_t>::max()) * 2}, DT_INT64); // Shape is a vector of 2 elements (2,4) @@ -521,7 +523,8 @@ TEST_F(ShapeRefinerTest, PropagateShapeAcrossTensorContentInt32Overflow) { // Create variable 2x4 tensor. auto input = ops::Variable( - root, {2, 4, static_cast<int64_t>(std::numeric_limits<int32>::max()) * 2}, + root, + {2, 4, static_cast<int64_t>(std::numeric_limits<int32_t>::max()) * 2}, DT_INT32); // Shape is a vector of 2 elements (2,4) @@ -607,7 +610,7 @@ TEST_F(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt64) { auto input = ops::Variable( root, {1, 2, 3, 4, 5, - static_cast<int64_t>(std::numeric_limits<int32>::max()) * 2}, + static_cast<int64_t>(std::numeric_limits<int32_t>::max()) * 2}, DT_INT64); // 5! * int32_max_value * 2. @@ -638,7 +641,7 @@ TEST_F(ShapeRefinerTest, PropagateSizeAcrossTensorContentInt32Overflow) { auto input = ops::Variable( root, {1, 2, 3, 4, 5, - static_cast<int64_t>(std::numeric_limits<int32>::max()) * 2}, + static_cast<int64_t>(std::numeric_limits<int32_t>::max()) * 2}, DT_INT32); // 5!. @@ -845,7 +848,7 @@ absl::Status PartialTensorAsShapeShapeFn(shape_inference::InferenceContext* c) { return absl::OkStatus(); } TF_RETURN_IF_ERROR( - c->MakeShapeFromTensorShape(TensorShape({t->flat<int32>()(0)}), &out)); + c->MakeShapeFromTensorShape(TensorShape({t->flat<int32_t>()(0)}), &out)); c->set_output(0, out); return absl::OkStatus(); } @@ -967,10 +970,10 @@ TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInt32) { InputList inputs{ // clang-format off - Input(ops::Const<int32>(root, 10)), - Input(ops::Const<int32>(root, 20)), + Input(ops::Const<int32_t>(root, 10)), + Input(ops::Const<int32_t>(root, 20)), Input(Output(scalar_non_const)), - Input(ops::Const<int32>(root, 40)), + Input(ops::Const<int32_t>(root, 40)), }; // clang-format on auto pack = ops::Stack(root, inputs); TF_ASSERT_OK(root.status()); diff --git a/tensorflow/core/common_runtime/simple_propagator_state.cc b/tensorflow/core/common_runtime/simple_propagator_state.cc index af721c1893baa0..3855c6a3d6cfce 100644 --- a/tensorflow/core/common_runtime/simple_propagator_state.cc +++ b/tensorflow/core/common_runtime/simple_propagator_state.cc @@ -35,7 +35,7 @@ SimplePropagatorState::SimplePropagatorState( vlog_(vlog || VLOG_IS_ON(1)), input_tensors_(finfo.total_inputs), pending_( - new std::atomic<int32>[immutable_state.graph_view().num_nodes()]), + new std::atomic<int32_t>[immutable_state.graph_view().num_nodes()]), active_(vlog_ ? new std::vector<bool>( immutable_state.graph_view().num_nodes()) : nullptr), diff --git a/tensorflow/core/common_runtime/simple_propagator_state.h b/tensorflow/core/common_runtime/simple_propagator_state.h index 3c53a5f900414f..8ef9775f93aee8 100644 --- a/tensorflow/core/common_runtime/simple_propagator_state.h +++ b/tensorflow/core/common_runtime/simple_propagator_state.h @@ -167,7 +167,7 @@ class SimplePropagatorState { // is never concurrent access to the same entry. std::vector<Entry> input_tensors_; - std::unique_ptr<std::atomic<int32>[]> pending_; + std::unique_ptr<std::atomic<int32_t>[]> pending_; // If `vlog_` is true, this stores a bit vector of active nodes, indexed by // node ID. diff --git a/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.cc b/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.cc index 10226237cbb8e3..5eb084d0def629 100644 --- a/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.cc +++ b/tensorflow/core/common_runtime/simplify_ici_dummy_variables_pass.cc @@ -109,12 +109,12 @@ void RedirectEdge(Graph* graph, Node* old_src_node, Node* dst_node, } // Find the corresponding host device name from the TPU device name. -string GetHostDeviceName(Node* tpu_node) { +std::string GetHostDeviceName(Node* tpu_node) { auto device_name = tpu_node->requested_device(); if (device_name.empty()) device_name = tpu_node->assigned_device_name(); DeviceNameUtils::ParsedName parsed_device_name; DeviceNameUtils::ParseFullName(device_name, &parsed_device_name); - string host_device_name = DeviceNameUtils::FullName( + std::string host_device_name = DeviceNameUtils::FullName( parsed_device_name.job, parsed_device_name.replica, parsed_device_name.task, /*type=*/"CPU", /*id=*/0); return host_device_name; @@ -143,7 +143,8 @@ int GetTPUTaskId(Node* tpu_node) { // Build the fill op. Its value is 0 and the fill op is put on the host device // with the same task id as the TPUExecute node.
Node* BuildFillOp(GraphDefBuilder::Options& bopts, Node* tpu_node, - Node* in_node, int input_index, string host_device_name) { + Node* in_node, int input_index, + std::string host_device_name) { // Find the output_shape vector auto output_shape_vec = GetOutputShapeVec(in_node); if (!output_shape_vec.has_value()) return nullptr; @@ -191,7 +192,7 @@ absl::Status ReplaceIciDummyVariables(Graph* graph, int input_index, continue; } - string host_device_name = GetHostDeviceName(tpu_node); + std::string host_device_name = GetHostDeviceName(tpu_node); // If the node corresponding to host_device_name is already in the graph, // replace the edge from in_node to tpu_node with the edge from diff --git a/tensorflow/core/common_runtime/single_threaded_executor.cc b/tensorflow/core/common_runtime/single_threaded_executor.cc index a7c30baec739ad..c737d274fbcd64 100644 --- a/tensorflow/core/common_runtime/single_threaded_executor.cc +++ b/tensorflow/core/common_runtime/single_threaded_executor.cc @@ -65,8 +65,8 @@ namespace { typedef absl::InlinedVector TensorValueVec; typedef absl::InlinedVector AllocatorAttributeVec; -static const string& kSingleThreadedExecutor = - *new string("SINGLE_THREADED_EXECUTOR"); +static const std::string& kSingleThreadedExecutor = + *new std::string("SINGLE_THREADED_EXECUTOR"); class SingleThreadedExecutorImpl : public Executor { public: diff --git a/tensorflow/core/common_runtime/single_threaded_executor_test.cc b/tensorflow/core/common_runtime/single_threaded_executor_test.cc index 334ada5ad0a389..b081e17d86a978 100644 --- a/tensorflow/core/common_runtime/single_threaded_executor_test.cc +++ b/tensorflow/core/common_runtime/single_threaded_executor_test.cc @@ -170,8 +170,9 @@ float V(const Tensor& tensor) { return tensor.scalar()(); } -Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation, - const string& receiver, const string& name) { +Rendezvous::ParsedKey Key(const std::string& sender, const uint64_t incarnation, + const std::string& receiver, + const std::string& name) { Rendezvous::ParsedKey result; TF_CHECK_OK( Rendezvous::ParseKey(Rendezvous::CreateKey(sender, incarnation, receiver, @@ -363,8 +364,8 @@ void BM_executor(::testing::benchmark::State& state) { Graph* g = new Graph(OpRegistry::Global()); random::PhiloxRandom philox(1729, 17); random::SimplePhilox rand(&philox); - uint64 cur = 0; - uint32 r = 1 + rand.Rand32() % width; + uint64_t cur = 0; + uint32_t r = 1 + rand.Rand32() % width; std::vector ready_nodes; for (int i = 0; i < r; ++i) { ready_nodes.push_back(test::graph::NoOp(g, {})); @@ -392,7 +393,7 @@ void BM_executor(::testing::benchmark::State& state) { test::Benchmark("cpu", g, nullptr, nullptr, nullptr, "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false) .Run(state); - state.SetLabel(strings::StrCat("Nodes = ", cur)); + state.SetLabel(absl::StrCat("Nodes = ", cur)); state.SetItemsProcessed(cur * static_cast(state.iterations())); } @@ -424,7 +425,7 @@ void BM_const_identity(::testing::benchmark::State& state) { "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false) .Run(state); - state.SetLabel(strings::StrCat("Nodes = ", (1 + outputs_per_const) * width)); + state.SetLabel(absl::StrCat("Nodes = ", (1 + outputs_per_const) * width)); state.SetItemsProcessed((1 + outputs_per_const) * width * static_cast(state.iterations())); } diff --git a/tensorflow/core/common_runtime/stats_publisher_interface.cc b/tensorflow/core/common_runtime/stats_publisher_interface.cc index 8b04ac9f80523d..610efbdadb7dc8 100644 --- 
a/tensorflow/core/common_runtime/stats_publisher_interface.cc +++ b/tensorflow/core/common_runtime/stats_publisher_interface.cc @@ -43,7 +43,8 @@ class NoOpStatsPublisher : public StatsPublisherInterface { function_records) override {} std::unique_ptr GetProfileHandler( - uint64 step, int64_t execution_count, const RunOptions& ropts) override { + uint64_t step, int64_t execution_count, + const RunOptions& ropts) override { return nullptr; } @@ -74,7 +75,7 @@ StatsPublisherFactory StatsPublisherInterface::GetStatsPublisherFactory() { } std::unique_ptr CreateNoOpStatsPublisher( - const string& session, const BuildGraphOptions& bopts, + const std::string& session, const BuildGraphOptions& bopts, const SessionOptions& sopts) { return std::unique_ptr(new NoOpStatsPublisher); } diff --git a/tensorflow/core/common_runtime/stats_publisher_interface.h b/tensorflow/core/common_runtime/stats_publisher_interface.h index 450683e643dc0c..2f0e3221be97cb 100644 --- a/tensorflow/core/common_runtime/stats_publisher_interface.h +++ b/tensorflow/core/common_runtime/stats_publisher_interface.h @@ -61,7 +61,7 @@ class StatsPublisherInterface { // // This method may return a null pointer, if no handler was created. virtual std::unique_ptr GetProfileHandler( - uint64 step, int64_t execution_count, const RunOptions& ropts) = 0; + uint64_t step, int64_t execution_count, const RunOptions& ropts) = 0; virtual ~StatsPublisherInterface() {} @@ -77,7 +77,7 @@ class StatsPublisherInterface { }; std::unique_ptr CreateNoOpStatsPublisher( - const string& session, const BuildGraphOptions& bopts, + const std::string& session, const BuildGraphOptions& bopts, const SessionOptions& sopts); } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc index 03fe4d946bdb0d..cc32e668309402 100644 --- a/tensorflow/core/common_runtime/step_stats_collector.cc +++ b/tensorflow/core/common_runtime/step_stats_collector.cc @@ -36,7 +36,7 @@ const int kMaxAllocReportNodes = 100; const float kMaxAllocReportFraction = 0.99; struct AllocStats { - std::map> nodes_by_size; + std::map> nodes_by_size; int64_t total_bytes = 0; int64_t total_nodes = 0; }; @@ -65,39 +65,39 @@ NodeExecStatsWrapper::NodeExecStatsWrapper( node_(node), step_stats_collector_(step_stats_collector) {} -void NodeExecStatsWrapper::Done(const string& device) { +void NodeExecStatsWrapper::Done(const std::string& device) { // TODO(tucker): merge with the DetailText function in session.cc in a common // location. 
DCHECK(node_); - string memory; + std::string memory; for (auto& all : stats_->memory()) { int64_t tot = all.total_bytes(); if (tot >= 0.1 * 1048576.0) { int64_t peak = all.peak_bytes(); if (peak > 0) { memory = - strings::StrCat(memory, "[", all.allocator_name(), - strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0, - peak / 1048576.0)); + absl::StrCat(memory, "[", all.allocator_name(), + strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0, + peak / 1048576.0)); } else { - memory = strings::StrCat(memory, "[", all.allocator_name(), - strings::Printf(" %.1fMB] ", tot / 1048576.0)); + memory = absl::StrCat(memory, "[", all.allocator_name(), + strings::Printf(" %.1fMB] ", tot / 1048576.0)); } } } const AttrSlice attrs(*node_); - string text; + std::string text; if (IsSend(node_)) { - string tensor_name; + std::string tensor_name; TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name)); - string recv_device; + std::string recv_device; TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device)); text = strings::StrCat(memory, node_->name(), " = ", node_->op(), "(", tensor_name, " @", recv_device, ")"); } else if (IsRecv(node_)) { - string tensor_name; + std::string tensor_name; TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name)); - string send_device; + std::string send_device; TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device)); text = strings::StrCat(memory, node_->name(), " = ", node_->op(), "(", tensor_name, " @", send_device, ")"); @@ -197,7 +197,7 @@ void NodeExecStatsWrapper::Finalize() { StepStatsCollector::StepStatsCollector(StepStats* step_stats) : finalized_(false), step_stats_(step_stats) {} -static int ExtractGpuWithStreamAll(string device_name) { +static int ExtractGpuWithStreamAll(std::string device_name) { // Check if the device name matches the ".*gpu:(\\d+)/stream:all$" regexp, // and if it does return the stream index (always positive). If it doesn't // return -1. @@ -220,7 +220,7 @@ static int ExtractGpuWithStreamAll(string device_name) { } else { // Convert the captured string into an integer. But first we need to put // the digits back in order - string ordered_capture(capture); + std::string ordered_capture(capture); std::reverse(ordered_capture.begin(), ordered_capture.end()); int gpu_id; CHECK(absl::SimpleAtoi(ordered_capture, &gpu_id)); @@ -228,7 +228,7 @@ static int ExtractGpuWithStreamAll(string device_name) { } } -static int ExtractGpuWithoutStream(string device_name) { +static int ExtractGpuWithoutStream(std::string device_name) { // Check if the device name matches the ".*gpu:(\\d+)$" regexp, // and if it does return the stream index (always positive). If it doesn't // return -1. @@ -249,7 +249,7 @@ static int ExtractGpuWithoutStream(string device_name) { } else { // Convert the captured string into an integer. 
But first we need to put // the digits back in order - string ordered_capture(capture); + std::string ordered_capture(capture); std::reverse(ordered_capture.begin(), ordered_capture.end()); int gpu_id; CHECK(absl::SimpleAtoi(ordered_capture, &gpu_id)); @@ -259,7 +259,7 @@ static int ExtractGpuWithoutStream(string device_name) { void StepStatsCollector::BuildCostModel( CostModelManager* cost_model_manager, - const std::unordered_map& device_map) { + const std::unordered_map& device_map) { mutex_lock lock(mu_); if (!finalized_) { @@ -282,7 +282,7 @@ void StepStatsCollector::BuildCostModel( for (int i = 0; i < step_stats_->dev_stats_size(); ++i) { const DeviceStepStats& device_stats = step_stats_->dev_stats(i); - const string& device_name = device_stats.device(); + const std::string& device_name = device_stats.device(); const int gpu_id = ExtractGpuWithStreamAll(device_name); if (gpu_id >= 0) { // These are gpu hardware stats @@ -296,7 +296,7 @@ void StepStatsCollector::BuildCostModel( for (auto& itr : per_device_stats) { const absl::string_view device_name = itr.first; - const int gpu_id = ExtractGpuWithoutStream(string(device_name)); + const int gpu_id = ExtractGpuWithoutStream(std::string(device_name)); if (gpu_id >= 0) { // Reference the gpu hardware stats in addition to the regular stats // for this gpu device if they're available. @@ -324,10 +324,10 @@ void StepStatsCollector::BuildCostModel( const DeviceStats& dev_stats = per_device_stats.find(device)->second; - std::unordered_map name_to_hw_node_stats; + std::unordered_map name_to_hw_node_stats; if (dev_stats.hardware_stats) { for (const auto& node_stats : dev_stats.hardware_stats->node_stats()) { - string node_name = node_stats.node_name(); + std::string node_name = node_stats.node_name(); // Remove the part of op name (e.g. :Conv2D) in the end of a node name. 
size_t pos = node_name.find_first_of(':'); if (pos != std::string::npos) { @@ -368,7 +368,8 @@ void StepStatsCollector::BuildCostModel( cm->RecordMemoryStats(node, stats.memory_stats()); // Use hardware stats to record the execution time if they're available, // otherwise use the regular (less accurate) stats - string node_name = dev_stats.regular_stats->node_stats(i).node_name(); + std::string node_name = + dev_stats.regular_stats->node_stats(i).node_name(); if (dev_stats.hardware_stats && name_to_hw_node_stats.find(node_name) != name_to_hw_node_stats.end()) { const NodeExecStats& hw_stats = name_to_hw_node_stats[node_name]; @@ -383,14 +384,14 @@ void StepStatsCollector::BuildCostModel( } } -void StepStatsCollector::Save(const string& device, +void StepStatsCollector::Save(const std::string& device, NodeExecStats* node_stats_pb) { Save(device, new NodeExecStatsWrapper(std::unique_ptr(node_stats_pb), nullptr, this)); } -void StepStatsCollector::Save(const string& device, +void StepStatsCollector::Save(const std::string& device, NodeExecStatsWrapper* node_stats) { if (!node_stats) return; VLOG(1) << "Save dev " << device << " node stats " << node_stats->stats(); @@ -410,9 +411,9 @@ void StepStatsCollector::Save(const string& device, } } -void StepStatsCollector::SaveThreadName(const string& device, - const uint32 thread_id, - const string& thread_name) { +void StepStatsCollector::SaveThreadName(const std::string& device, + const uint32_t thread_id, + const std::string& thread_name) { VLOG(1) << "Save dev " << device << " thread id " << thread_id << " name " << thread_name; { @@ -434,17 +435,17 @@ NodeExecStatsInterface* StepStatsCollector::CreateNodeExecStats( return new NodeExecStatsWrapper(node, this); } -string StepStatsCollector::ReportAllocsOnResourceExhausted( +std::string StepStatsCollector::ReportAllocsOnResourceExhausted( const absl::string_view err) { mutex_lock l(mu_); if (err.find("OOM") == err.npos) { return ""; } // -> AllocStats - std::map, AllocStats> allocs_map; - string report = "\n"; + std::map, AllocStats> allocs_map; + std::string report = "\n"; for (const auto& dev_stat : dev_stats_) { - const string& device = dev_stat.first; + const std::string& device = dev_stat.first; // Only print the device that has OOM. // TODO(xpan): Extract device from err first to speed it up. if (err.find(device) == err.npos) { @@ -490,7 +491,7 @@ string StepStatsCollector::ReportAllocsOnResourceExhausted( // Print allocations stats of the pair. for (auto it = dev_allocs_stats.nodes_by_size.rbegin(); it != dev_allocs_stats.nodes_by_size.rend(); ++it) { - for (const string& node_name : it->second) { + for (const std::string& node_name : it->second) { reported_bytes += it->first; strings::StrAppend(&report, " ", strings::HumanReadableNumBytes(it->first), " from ", @@ -532,7 +533,7 @@ void StepStatsCollector::FinalizeInternal() { return; } finalized_ = true; - std::map dev_stats_pb; + std::map dev_stats_pb; for (auto& ds : *step_stats_->mutable_dev_stats()) { dev_stats_pb[ds.device()] = &ds; } diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h index 277630cd40f9de..1c3503a8101654 100644 --- a/tensorflow/core/common_runtime/step_stats_collector.h +++ b/tensorflow/core/common_runtime/step_stats_collector.h @@ -51,7 +51,7 @@ class NodeExecStatsInterface { // Called when the statistics collection for the node has finished. Once this // method is called, the caller should not make assumptions about the validity // of this object. 
- virtual void Done(const string& device) = 0; + virtual void Done(const std::string& device) = 0; // Called immediately after this node starts being processed by the executor. virtual void RecordExecutorStarted() = 0; @@ -101,7 +101,7 @@ class NodeExecStatsWrapper : public NodeExecStatsInterface { // Destructor calls Finalize() to release the TrackingAllocators. ~NodeExecStatsWrapper() override { Finalize(); } - void Done(const string& device) override; + void Done(const std::string& device) override; void RecordExecutorStarted() override; void RecordComputeStarted() override; void RecordComputeEnded() override; @@ -148,7 +148,8 @@ class StepStatsCollectorInterface { // `err` message needs to contain device name and allocator name, e.g.: // "ResourceExhaustedError: OOM when allocating tensor ... // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc" - virtual string ReportAllocsOnResourceExhausted(absl::string_view err) = 0; + virtual std::string ReportAllocsOnResourceExhausted( + absl::string_view err) = 0; }; // StepStatsCollector manages the collection of a StepStats object. @@ -164,19 +165,19 @@ class StepStatsCollector : public StepStatsCollectorInterface { // device_map. void BuildCostModel( CostModelManager* cost_model_manager, - const std::unordered_map& device_map); + const std::unordered_map& device_map); // Saves node statistics to the DeviceStats object associated with device. // Should be called before Finalize. - void Save(const string& device, NodeExecStats* node_stats_pb); - void Save(const string& device, NodeExecStatsWrapper* node_stats); + void Save(const std::string& device, NodeExecStats* node_stats_pb); + void Save(const std::string& device, NodeExecStatsWrapper* node_stats); // Saves thread name. - void SaveThreadName(const string& device, const uint32 thread_id, - const string& thread_name); + void SaveThreadName(const std::string& device, const uint32_t thread_id, + const std::string& thread_name); NodeExecStatsInterface* CreateNodeExecStats(const NodeDef* node) override; - string ReportAllocsOnResourceExhausted(absl::string_view err) override; + std::string ReportAllocsOnResourceExhausted(absl::string_view err) override; // The following 2 Finalize methods populate the StepStats passed // from the constructor. Calling it more than once won't have any effect. @@ -188,19 +189,21 @@ class StepStatsCollector : public StepStatsCollectorInterface { private: // TODO(suharshs): Make this configurable if its not possible to find a value // that works for all cases. 
- static constexpr uint64 kMaxCollectedNodes = 1 << 20; + static constexpr uint64_t kMaxCollectedNodes = 1 << 20; typedef std::vector> NodeStatsVector; - typedef std::unordered_map ThreadNamesMap; + typedef std::unordered_map ThreadNamesMap; void FinalizeInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); mutex mu_; bool finalized_ TF_GUARDED_BY(mu_); - std::unordered_map dev_stats_ TF_GUARDED_BY(mu_); - std::unordered_map thread_names_ TF_GUARDED_BY(mu_); + std::unordered_map dev_stats_ + TF_GUARDED_BY(mu_); + std::unordered_map thread_names_ + TF_GUARDED_BY(mu_); StepStats* step_stats_ TF_GUARDED_BY(mu_); - uint64 collected_nodes_ TF_GUARDED_BY(mu_) = 0; + uint64_t collected_nodes_ TF_GUARDED_BY(mu_) = 0; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/testlib_ops.cc b/tensorflow/core/common_runtime/testlib_ops.cc index 11970bee114128..d36ad0a20dc0a0 100644 --- a/tensorflow/core/common_runtime/testlib_ops.cc +++ b/tensorflow/core/common_runtime/testlib_ops.cc @@ -46,7 +46,7 @@ class ErrorOp : public OpKernel { } private: - string errmsg_; + std::string errmsg_; bool log_error_ = false; }; REGISTER_KERNEL_BUILDER(Name("Error").Device(DEVICE_CPU), ErrorOp); diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 23166b69540083..8ada1107f7f044 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -54,7 +54,7 @@ info. It does not have any negative impact on performance. */ namespace tensorflow { ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, - const string& name, Bytes memory_limit, + const std::string& name, Bytes memory_limit, const DeviceLocality& locality, Allocator* allocator) : LocalDevice(options, Device::BuildDeviceAttributes( diff --git a/tensorflow/core/common_runtime/threadpool_device.h b/tensorflow/core/common_runtime/threadpool_device.h index 08175ccb1f231c..4e6c0b87935082 100644 --- a/tensorflow/core/common_runtime/threadpool_device.h +++ b/tensorflow/core/common_runtime/threadpool_device.h @@ -25,7 +25,7 @@ namespace tensorflow { // CPU device implementation. class ThreadPoolDevice : public LocalDevice { public: - ThreadPoolDevice(const SessionOptions& options, const string& name, + ThreadPoolDevice(const SessionOptions& options, const std::string& name, Bytes memory_limit, const DeviceLocality& locality, Allocator* allocator); ~ThreadPoolDevice() override; diff --git a/tensorflow/core/common_runtime/threadpool_device_factory.cc b/tensorflow/core/common_runtime/threadpool_device_factory.cc index 3ac8ea5ae8b68c..a6756935e63e27 100644 --- a/tensorflow/core/common_runtime/threadpool_device_factory.cc +++ b/tensorflow/core/common_runtime/threadpool_device_factory.cc @@ -29,14 +29,14 @@ namespace tensorflow { // TODO(zhifengc/tucker): Figure out the bytes of available RAM. 
class ThreadPoolDeviceFactory : public DeviceFactory { public: - absl::Status ListPhysicalDevices(std::vector* devices) override { + absl::Status ListPhysicalDevices(std::vector* devices) override { devices->push_back("/physical_device:CPU:0"); return absl::OkStatus(); } absl::Status CreateDevices( - const SessionOptions& options, const string& name_prefix, + const SessionOptions& options, const std::string& name_prefix, std::vector>* devices) override { int num_numa_nodes = port::NUMANumNodes(); int n = 1; @@ -45,7 +45,7 @@ class ThreadPoolDeviceFactory : public DeviceFactory { n = iter->second; } for (int i = 0; i < n; i++) { - string name = strings::StrCat(name_prefix, "/device:CPU:", i); + std::string name = absl::StrCat(name_prefix, "/device:CPU:", i); std::unique_ptr tpd; if (options.config.experimental().use_numa_affinity()) { int numa_node = i % num_numa_nodes; diff --git a/tensorflow/core/common_runtime/type_inference.cc b/tensorflow/core/common_runtime/type_inference.cc index 8239e1c2196767..0434c287f31a5e 100644 --- a/tensorflow/core/common_runtime/type_inference.cc +++ b/tensorflow/core/common_runtime/type_inference.cc @@ -125,7 +125,7 @@ absl::Status update_inferred_type(Node* target, const FullTypeDef& t, return absl::OkStatus(); } -absl::StatusOr run_inference(const string& fn_name, +absl::StatusOr run_inference(const std::string& fn_name, const TypeRefVector& in_types) { // TODO(b/224776031): Things remaining to implement: // * look up function by name diff --git a/tensorflow/core/common_runtime/type_inference_test.cc b/tensorflow/core/common_runtime/type_inference_test.cc index 068f81ea191ace..6f7a165e695326 100644 --- a/tensorflow/core/common_runtime/type_inference_test.cc +++ b/tensorflow/core/common_runtime/type_inference_test.cc @@ -60,7 +60,6 @@ TEST(TypeInferenceTest, BasicStraightline) { Node* ds; TensorShapeProto shape; - shape.mutable_dim(); shape.set_unknown_rank(false); TF_ASSERT_OK(NodeBuilder("ds", "RangeDataset", &root.graph()->flib_def()) .Input({NodeBuilder::NodeOut(start.node())}) @@ -100,7 +99,6 @@ TEST(TypeInferenceTest, CyclicGraphWithV1ControlFlow) { Node* ds; TensorShapeProto shape; - shape.mutable_dim(); shape.set_unknown_rank(false); TF_ASSERT_OK(NodeBuilder("ds", "RangeDataset", &root.graph()->flib_def()) .Input({NodeBuilder::NodeOut(start.node())}) @@ -443,7 +441,6 @@ TEST(ReverseTypeInferenceTest, BasicVDependency) { Node* ds; // This node has a type constructor. 
TensorShapeProto shape; - shape.mutable_dim(); shape.set_unknown_rank(false); TF_ASSERT_OK(NodeBuilder("ds", "RangeDataset", &root.graph()->flib_def()) .Input({NodeBuilder::NodeOut(start.node())}) @@ -491,7 +488,6 @@ TEST(ReverseTypeInferenceTest, FromUnsetType) { Node* it; TensorShapeProto shape; - shape.mutable_dim(); shape.set_unknown_rank(false); TF_ASSERT_OK( NodeBuilder("it", "AnonymousIteratorV2", &root.graph()->flib_def()) diff --git a/tensorflow/core/config/flag_defs.h b/tensorflow/core/config/flag_defs.h index d6bc4d9531173f..f257876ad907ad 100644 --- a/tensorflow/core/config/flag_defs.h +++ b/tensorflow/core/config/flag_defs.h @@ -69,6 +69,9 @@ class Flags { "graphs.") TF_DECLARE_FLAG(enable_graph_debug_info_caching_for_stack_frames, true, "If true, graph debug info will cache the stack frames.") + TF_DECLARE_FLAG( + enable_fatal_error_on_collective_abort, false, + "If true, a fatal error will be raised when a collective is aborted.") // LINT.ThenChange(//tensorflow/core/config/flags_api_wrapper.cc) }; diff --git a/tensorflow/core/config/flags_api_wrapper.cc b/tensorflow/core/config/flags_api_wrapper.cc index 9da0ba1b64b0b7..13b3fcb0d135bf 100644 --- a/tensorflow/core/config/flags_api_wrapper.cc +++ b/tensorflow/core/config/flags_api_wrapper.cc @@ -56,5 +56,6 @@ PYBIND11_MODULE(flags_pybind, m) { TF_PY_DECLARE_FLAG(enable_function_pruning_before_inlining) TF_PY_DECLARE_FLAG(enable_skip_encapsulation_for_non_tpu_graphs) TF_PY_DECLARE_FLAG(enable_graph_debug_info_caching_for_stack_frames) + TF_PY_DECLARE_FLAG(enable_fatal_error_on_collective_abort) // LINT.ThenChange(//tensorflow/core/config/flag_defs.h) }; diff --git a/tensorflow/core/data/compression_utils.cc b/tensorflow/core/data/compression_utils.cc index bd65978ab1da1f..cfc5e4b18208ef 100644 --- a/tensorflow/core/data/compression_utils.cc +++ b/tensorflow/core/data/compression_utils.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/core/data/compression_utils.h" #include +#include <cstdint> +#include <limits> #include #include @@ -123,7 +125,7 @@ absl::Status CompressElement(const std::vector<Tensor>& element, } } - if (iov.NumBytes() > kuint32max) { + if (iov.NumBytes() > std::numeric_limits<uint32_t>::max()) { return errors::OutOfRange("Encountered dataset element of size ", iov.NumBytes(), ", exceeding the 4GB Snappy limit."); diff --git a/tensorflow/core/data/dataset_utils.cc b/tensorflow/core/data/dataset_utils.cc index 14f385dcaa48cb..a9300194089efe 100644 --- a/tensorflow/core/data/dataset_utils.cc +++ b/tensorflow/core/data/dataset_utils.cc @@ -30,6 +30,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/strings/ascii.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" @@ -493,7 +494,7 @@ bool MatchesAnyVersion(absl::string_view op_prefix, return true; } size_t index = op_to_match.length() - 1; - while (isdigit(op_to_match[index])) { + while (absl::ascii_isdigit(op_to_match[index])) { index--; } return (op_to_match[index] == 'V') && (op_prefix.length() == index); diff --git a/tensorflow/core/data/dataset_utils_test.cc b/tensorflow/core/data/dataset_utils_test.cc index cf99ef7088c5ae..b8f0434adae02d 100644 --- a/tensorflow/core/data/dataset_utils_test.cc +++ b/tensorflow/core/data/dataset_utils_test.cc @@ -718,8 +718,6 @@ INSTANTIATE_TEST_SUITE_P(Test, GetOptimizationsTest, GetOptimizationTestCase4())); TEST(DeterministicOpsTest, GetOptimizations) { - // TODO(b/259305727): Re-enable for MacOS when the bug is fixed. -#if !defined(__APPLE__) tsl::test::DeterministicOpsScope det_scope; Options options; // options.deterministic should be ignored when deterministic ops are enabled. @@ -729,7 +727,6 @@ TEST(DeterministicOpsTest, GetOptimizations) { EXPECT_THAT(std::vector<tstring>(actual_enabled.begin(), actual_enabled.end()), ::testing::UnorderedElementsAreArray({"make_deterministic"})); EXPECT_EQ(actual_disabled.size(), 0); -#endif } REGISTER_DATASET_EXPERIMENT("test_only_experiment", diff --git a/tensorflow/core/data/service/client/data_service_client.cc b/tensorflow/core/data/service/client/data_service_client.cc index 36aa6492a22faa..1a79089fbccc0f 100644 --- a/tensorflow/core/data/service/client/data_service_client.cc +++ b/tensorflow/core/data/service/client/data_service_client.cc @@ -119,7 +119,7 @@ absl::Status DataServiceClient::Initialize( << " in tf.data service client."; dispatcher_ = std::make_unique<DataServiceDispatcherClient>(params_.address, params_.protocol); - int64_t deadline_micros = kint64max; + int64_t deadline_micros = std::numeric_limits<int64_t>::max(); std::optional<std::string> job_name; if (!params_.job_name.empty()) { job_name = params_.job_name; } @@ -668,7 +668,7 @@ void DataServiceClient::RunWorkerThread(std::function<void()> done) } VLOG(3) << "Processing task " << task_to_process->info.task_id(); } - int64_t deadline_micros = kint64max; + int64_t deadline_micros = std::numeric_limits<int64_t>::max(); absl::Status s = GetElementTraced(task_to_process.get(), deadline_micros, /*enqueue_result=*/!IsCoordinatedRead(), allow_skip, result); diff --git a/tensorflow/core/data/service/dispatcher_client.cc b/tensorflow/core/data/service/dispatcher_client.cc index 87608b858eedee..c06acb3e332ddf 100644 --- a/tensorflow/core/data/service/dispatcher_client.cc +++ b/tensorflow/core/data/service/dispatcher_client.cc @@ -379,9 +379,9 @@ absl::Status DataServiceDispatcherClient::DisableCompressionAtRuntime( } absl::Status DataServiceDispatcherClient::EnsureInitialized() { - return grpc_util::Retry([this] { return Initialize(); }, - "Initialize dispatcher client", - /*deadline_micros=*/kint64max); + return grpc_util::Retry( + [this] { return Initialize(); }, "Initialize dispatcher client", + /*deadline_micros=*/std::numeric_limits<int64_t>::max()); } } // namespace data diff --git a/tensorflow/core/data/service/task_runner.cc b/tensorflow/core/data/service/task_runner.cc index 2b85af5aa20b73..a4d82ede95d362 100644 --- a/tensorflow/core/data/service/task_runner.cc +++ b/tensorflow/core/data/service/task_runner.cc @@ -232,7 +232,7 @@ std::shared_ptr<model::Model> CachingTaskRunner::model() const {
RoundRobinTaskRunner::RoundRobinTaskRunner( std::unique_ptr<TaskIterator> iterator, int64_t num_consumers, - string worker_address) + std::string worker_address) : num_consumers_(num_consumers), worker_address_(worker_address), buffer_(num_consumers_), diff --git a/tensorflow/core/data/service/task_runner.h b/tensorflow/core/data/service/task_runner.h index 79d698f9edc65f..9f208b6bc0c35e 100644 --- a/tensorflow/core/data/service/task_runner.h +++ b/tensorflow/core/data/service/task_runner.h @@ -261,7 +261,7 @@ class PrefetchThread { class RoundRobinTaskRunner : public TaskRunner { public: RoundRobinTaskRunner(std::unique_ptr<TaskIterator> iterator, - int64_t num_consumers, string worker_address); + int64_t num_consumers, std::string worker_address); absl::Status GetNext(const GetElementRequest& req, GetElementResult& result) override; @@ -280,7 +280,7 @@ class RoundRobinTaskRunner : public TaskRunner { // start. absl::Status PrepareRound(const GetElementRequest& req); const int64_t num_consumers_; - const string worker_address_; + const std::string worker_address_; mutex mu_; bool cancelled_ TF_GUARDED_BY(mu_) = false; // Condition variable notified whenever we start a new round of round-robin. @@ -291,7 +291,7 @@ requests_ TF_GUARDED_BY(mu_); // Index of the first round we plan to serve. At startup, this is the minimum // of all requested element indices. - int64_t first_round_ TF_GUARDED_BY(mu_) = kint64max; + int64_t first_round_ TF_GUARDED_BY(mu_) = std::numeric_limits<int64_t>::max(); int64_t current_round_ TF_GUARDED_BY(mu_) = -1; bool round_skipped_ TF_GUARDED_BY(mu_) = false; // Buffered results for the current round. diff --git a/tensorflow/core/data/service/thread_safe_buffer_test.cc b/tensorflow/core/data/service/thread_safe_buffer_test.cc index ea4008b3886dde..b486a078cf92cc 100644 --- a/tensorflow/core/data/service/thread_safe_buffer_test.cc +++ b/tensorflow/core/data/service/thread_safe_buffer_test.cc @@ -167,7 +167,7 @@ TEST_P(ThreadSafeBufferTest, BlockWriterWhenBufferIsFull) { ASSERT_THAT(buffer.Push(Tensor("Test tensor")), absl_testing::IsOk()); } - uint64 push_time = 0; + uint64_t push_time = 0; auto thread = absl::WrapUnique(Env::Default()->StartThread( /*thread_options=*/{}, /*name=*/"writer_thread", [&buffer, &push_time]() { ASSERT_THAT(buffer.Push(Tensor("Test tensor")), absl_testing::IsOk()); @@ -176,7 +176,7 @@ // Popping an element unblocks the `Push` call. Env::Default()->SleepForMicroseconds(10000); - uint64 pop_time = Env::Default()->NowMicros(); + uint64_t pop_time = Env::Default()->NowMicros(); ASSERT_THAT(buffer.Pop(), absl_testing::IsOk()); thread.reset(); EXPECT_LE(pop_time, push_time); diff --git a/tensorflow/core/data/service/utils.cc b/tensorflow/core/data/service/utils.cc index 4f79b9384de3b7..c4a1ea3dfd351a 100644 --- a/tensorflow/core/data/service/utils.cc +++ b/tensorflow/core/data/service/utils.cc @@ -44,7 +44,7 @@ absl::Status ReadDatasetDef(const std::string& path, DatasetDef& dataset_def) { std::unique_ptr<RandomAccessFile> file; TF_RETURN_IF_ERROR(Env::Default()->NewRandomAccessFile(path, &file)); io::RecordReader reader(file.get()); - uint64 offset = 0; + uint64_t offset = 0; tstring record; TF_RETURN_IF_ERROR(reader.ReadRecord(&offset, &record)); if (!dataset_def.ParseFromString(record)) { diff --git a/tensorflow/core/data/service/worker_impl.cc b/tensorflow/core/data/service/worker_impl.cc index c89c8a1c4881f4..f5978a573b24e4 100644 --- a/tensorflow/core/data/service/worker_impl.cc +++ b/tensorflow/core/data/service/worker_impl.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/data/service/worker_impl.h" #include +#include <limits> #include #include #include @@ -183,9 +184,9 @@ absl::Status DataServiceWorkerImpl::Start( mutex_lock l(mu_); return !cancelled_; }; - TF_RETURN_IF_ERROR(grpc_util::Retry([this]() { return Heartbeat(); }, - should_retry, "Worker heartbeat.", - /*deadline_micros=*/kint64max)); + TF_RETURN_IF_ERROR(grpc_util::Retry( + [this]() { return Heartbeat(); }, should_retry, "Worker heartbeat.", + /*deadline_micros=*/std::numeric_limits<int64_t>::max())); LOG(INFO) << "Worker registered with dispatcher running at " << config_.dispatcher_address() << ". Worker config: " << config_.DebugString(); @@ -248,10 +249,10 @@ DataServiceWorkerImpl::CreateDispatcherClient() const TF_LOCKS_EXCLUDED(mu_) { mutex_lock l(mu_); return !cancelled_; }; - TF_RETURN_IF_ERROR( - grpc_util::Retry([&dispatcher]() { return dispatcher->Initialize(); }, - should_retry, "Initialize dispatcher client.", - /*deadline_micros=*/kint64max)); + TF_RETURN_IF_ERROR(grpc_util::Retry( + [&dispatcher]() { return dispatcher->Initialize(); }, should_retry, + "Initialize dispatcher client.", + /*deadline_micros=*/std::numeric_limits<int64_t>::max())); return dispatcher; } diff --git a/tensorflow/core/data/snapshot_utils.cc b/tensorflow/core/data/snapshot_utils.cc index 576cbed01fb633..9946b3dc213020 100644 --- a/tensorflow/core/data/snapshot_utils.cc +++ b/tensorflow/core/data/snapshot_utils.cc @@ -710,7 +710,7 @@ absl::Status Reader::MakeNestedDataset( datasets.push_back( new Dataset(DatasetContext(DatasetContext::Params( {"SnapshotDatasetReader", - strings::StrCat("SnapshotDatasetReader/_", i)})), + absl::StrCat("SnapshotDatasetReader/_", i)})), shard_dirs.at(i), compression_type, version, dtypes, shapes, dataset_start_index)); datasets.back()->Initialize(/*metadata=*/{}); diff --git a/tensorflow/core/data/snapshot_utils.h b/tensorflow/core/data/snapshot_utils.h index f083cbe495fa72..1af57f169efd97 100644 --- a/tensorflow/core/data/snapshot_utils.h +++ b/tensorflow/core/data/snapshot_utils.h @@ -65,10 +65,10 @@ constexpr char kShardDirectorySuffix[] = ".shard"; enum Mode { READER = 0, WRITER = 1, PASSTHROUGH = 2 }; // Returns the name of the "hash" directory for the given base path and hash ID.
-std::string HashDirectory(const std::string& path, uint64 hash); +std::string HashDirectory(const std::string& path, uint64_t hash); // Returns the name of the "run" directory for the given base path and run ID. -std::string RunDirectory(const std::string& hash_directory, uint64 run_id); +std::string RunDirectory(const std::string& hash_directory, uint64_t run_id); std::string RunDirectory(const std::string& hash_directory, const std::string& run_id); @@ -78,7 +78,7 @@ std::string ShardDirectory(const std::string& run_directory, int64_t shard_id); // Returns the checkpoint file name for the given directory and checkpoint ID. std::string GetCheckpointFileName(const std::string& shard_directory, - uint64 checkpoint_id); + uint64_t checkpoint_id); // This is a interface class that exposes snapshot writing functionality. class Writer { @@ -132,7 +132,7 @@ class TFRecordWriter : public Writer { // Writes snapshot with a custom (legacy) file format. class CustomWriter : public Writer { public: - static constexpr const size_t kHeaderSize = sizeof(uint64); + static constexpr const size_t kHeaderSize = sizeof(uint64_t); static constexpr const char* const kClassName = "SnapshotWriter"; static constexpr const char* const kWriteStringPiece = "WriteStringPiece"; @@ -210,7 +210,7 @@ class Reader { // the `version`, `compression_type`, and `dtypes` arguments passed into // `Writer` and `Reader` must be the same for the reading to succeed. static absl::Status Create(Env* env, const std::string& filename, - const string& compression_type, int version, + const std::string& compression_type, int version, const DataTypeVector& dtypes, std::unique_ptr* out_reader); @@ -221,7 +221,8 @@ class Reader { // contains all the elements written out to each individual snapshot file. static absl::Status MakeNestedDataset( Env* env, const std::vector& shard_dirs, - const string& compression_type, int version, const DataTypeVector& dtypes, + const std::string& compression_type, int version, + const DataTypeVector& dtypes, const std::vector& shapes, int64_t start_index, DatasetBase** output); @@ -253,7 +254,8 @@ class TFRecordReaderImpl { // tensorflow/compiler/xla/tsl/lib/io/compression.h. // `output_buffer_size` specifies the buffer size required by Snappy/Zlib // compression algorithms. Ignored if compression is not enabled. - TFRecordReaderImpl(const std::string& filename, const string& compression, + TFRecordReaderImpl(const std::string& filename, + const std::string& compression, std::optional output_buffer_size = std::nullopt); // Initializes the reader. Callers must initialize the reader before calling @@ -279,14 +281,14 @@ class TFRecordReaderImpl { uint64_t offset_ = 0; uint64_t bytes_read_ = 0; - const string compression_; + const std::string compression_; const std::optional output_buffer_size_; }; // Reads snapshots previously written with `TFRecordWriter`. class TFRecordReader : public Reader { public: - TFRecordReader(const std::string& filename, const string& compression, + TFRecordReader(const std::string& filename, const std::string& compression, const DataTypeVector& dtypes, std::optional output_buffer_size = std::nullopt) : reader_impl_(filename, compression, output_buffer_size), @@ -321,14 +323,14 @@ class CustomReader : public Reader { // TODO(b/148804377): Set this in a smarter fashion. 
static constexpr const int64_t kSnappyReaderOutputBufferSizeBytes = 32 << 20; // 32 MiB - static constexpr const size_t kHeaderSize = sizeof(uint64); + static constexpr const size_t kHeaderSize = sizeof(uint64_t); static constexpr const char* const kClassName = "SnapshotReader"; static constexpr const char* const kReadString = "ReadString"; static constexpr const char* const kReadCord = "ReadCord"; static constexpr const char* const kSeparator = "::"; - CustomReader(const std::string& filename, const string& compression_type, + CustomReader(const std::string& filename, const std::string& compression_type, int version, const DataTypeVector& dtypes); absl::Status ReadTensors(std::vector* read_tensors) override; @@ -356,7 +358,7 @@ class CustomReader : public Reader { std::string filename_; std::unique_ptr file_; std::unique_ptr input_stream_; - const string compression_type_; + const std::string compression_type_; const int version_; const DataTypeVector dtypes_; int num_simple_ = 0; @@ -366,18 +368,18 @@ class CustomReader : public Reader { // Writes snapshot metadata to the given directory. absl::Status WriteMetadataFile( - Env* env, const string& dir, + Env* env, const std::string& dir, const experimental::SnapshotMetadataRecord* metadata); // Writes distributed snapshot metadata to the given directory. An error is // returned if `dir` is unable to be created or if `metadata` is unable to be // written. absl::Status WriteMetadataFile( - Env* env, const string& dir, + Env* env, const std::string& dir, const experimental::DistributedSnapshotMetadata* metadata); // Reads snapshot metadata from the given directory. -absl::Status ReadMetadataFile(Env* env, const string& dir, +absl::Status ReadMetadataFile(Env* env, const std::string& dir, experimental::SnapshotMetadataRecord* metadata, bool* file_exists); @@ -386,17 +388,17 @@ absl::Status ReadMetadataFile(Env* env, const string& dir, // returned. If the file exists in `dir` but is unable to be opened, an error // is returned. absl::Status ReadMetadataFile( - Env* env, const string& dir, + Env* env, const std::string& dir, experimental::DistributedSnapshotMetadata* metadata, bool* file_exists); // Writes a dataset graph to the given directory. -absl::Status DumpDatasetGraph(Env* env, const std::string& path, uint64 hash, +absl::Status DumpDatasetGraph(Env* env, const std::string& path, uint64_t hash, const GraphDef* graph); absl::Status DetermineOpState( const std::string& mode_string, bool file_exists, const experimental::SnapshotMetadataRecord* metadata, - uint64 pending_snapshot_expiry_seconds, Mode* mode); + uint64_t pending_snapshot_expiry_seconds, Mode* mode); // Represents a dataset element or EOF. struct ElementOrEOF { @@ -420,9 +422,9 @@ struct ElementOrEOF { class AsyncWriter { public: explicit AsyncWriter(Env* env, int64_t file_index, - const std::string& shard_directory, uint64 checkpoint_id, - const std::string& compression, int64_t version, - const DataTypeVector& output_types, + const std::string& shard_directory, + uint64_t checkpoint_id, const std::string& compression, + int64_t version, const DataTypeVector& output_types, std::function done); // Writes the given tensors. 
The method is non-blocking and returns without @@ -437,7 +439,7 @@ class AsyncWriter { void Consume(ElementOrEOF* be) TF_LOCKS_EXCLUDED(mu_); bool ElementAvailable() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); absl::Status WriterThread(Env* env, const std::string& shard_directory, - uint64 checkpoint_id, + uint64_t checkpoint_id, const std::string& compression, int64_t version, DataTypeVector output_types); diff --git a/tensorflow/core/data/split_utils.cc b/tensorflow/core/data/split_utils.cc index 5248e6370781b6..44eda7649af7cc 100644 --- a/tensorflow/core/data/split_utils.cc +++ b/tensorflow/core/data/split_utils.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -82,7 +83,7 @@ absl::Status IndexSplitProvider::Restore( int64_t IndexSplitProvider::Cardinality() const { // RandomDataset uses kint64max to simulate infinite splits. // See RandomDatasetOp::Dataset::MakeSplitProviders. - if (n_ == tsl::kint64max) { + if (n_ == std::numeric_limits::max()) { return kInfiniteCardinality; } return n_; diff --git a/tensorflow/core/data/standalone.cc b/tensorflow/core/data/standalone.cc index 8052db9883010d..74c87c5103e21b 100644 --- a/tensorflow/core/data/standalone.cc +++ b/tensorflow/core/data/standalone.cc @@ -162,7 +162,7 @@ absl::Status Dataset::FromGraph(Params params, const GraphDef& graph_def, return absl::OkStatus(); }}); - string fetch_node = ""; + std::string fetch_node = ""; for (const auto& node : graph_def.node()) { if (node.op() == "_Retval") { fetch_node = node.input(0); diff --git a/tensorflow/core/data/standalone_test.cc b/tensorflow/core/data/standalone_test.cc index b60b4e752492e9..aa8e7259b5d90b 100644 --- a/tensorflow/core/data/standalone_test.cc +++ b/tensorflow/core/data/standalone_test.cc @@ -514,7 +514,7 @@ constexpr const char* const kMapGraphNoAutotuneProto = R"pb( TEST(Scalar, Standalone) { struct TestCase { - string graph_string; + std::string graph_string; std::vector expected_outputs; }; auto test_cases = { diff --git a/tensorflow/core/data/stats_utils.cc b/tensorflow/core/data/stats_utils.cc index 80c1de6dbd5576..12c12b8907994e 100644 --- a/tensorflow/core/data/stats_utils.cc +++ b/tensorflow/core/data/stats_utils.cc @@ -33,40 +33,40 @@ ABSL_CONST_INIT const char kFeaturesCount[] = "features_count"; ABSL_CONST_INIT const char kFeatureValuesCount[] = "feature_values_count"; ABSL_CONST_INIT const char kExamplesCount[] = "examples_count"; -string ExecutionTimeHistogramName(const string& prefix) { - return strings::StrCat(prefix, kDelimiter, kExecutionTime); +std::string ExecutionTimeHistogramName(const std::string& prefix) { + return absl::StrCat(prefix, kDelimiter, kExecutionTime); } -string ThreadUtilizationScalarName(const string& prefix) { - return strings::StrCat(prefix, kDelimiter, kThreadUtilization); +std::string ThreadUtilizationScalarName(const std::string& prefix) { + return absl::StrCat(prefix, kDelimiter, kThreadUtilization); } -string BufferSizeScalarName(const string& prefix) { - return strings::StrCat(prefix, kDelimiter, kBufferSize); +std::string BufferSizeScalarName(const std::string& prefix) { + return absl::StrCat(prefix, kDelimiter, kBufferSize); } -string BufferCapacityScalarName(const string& prefix) { - return strings::StrCat(prefix, kDelimiter, kBufferCapacity); +std::string BufferCapacityScalarName(const std::string& prefix) { + return absl::StrCat(prefix, kDelimiter, kBufferCapacity); } -string BufferUtilizationHistogramName(const string& prefix) { - return strings::StrCat(prefix, 
diff --git a/tensorflow/core/data/standalone.cc b/tensorflow/core/data/standalone.cc
index 8052db9883010d..74c87c5103e21b 100644
--- a/tensorflow/core/data/standalone.cc
+++ b/tensorflow/core/data/standalone.cc
@@ -162,7 +162,7 @@ absl::Status Dataset::FromGraph(Params params, const GraphDef& graph_def,
         return absl::OkStatus();
       }});
 
-  string fetch_node = "";
+  std::string fetch_node = "";
   for (const auto& node : graph_def.node()) {
     if (node.op() == "_Retval") {
       fetch_node = node.input(0);
diff --git a/tensorflow/core/data/standalone_test.cc b/tensorflow/core/data/standalone_test.cc
index b60b4e752492e9..aa8e7259b5d90b 100644
--- a/tensorflow/core/data/standalone_test.cc
+++ b/tensorflow/core/data/standalone_test.cc
@@ -514,7 +514,7 @@ constexpr const char* const kMapGraphNoAutotuneProto = R"pb(
 
 TEST(Scalar, Standalone) {
   struct TestCase {
-    string graph_string;
+    std::string graph_string;
     std::vector<Tensor> expected_outputs;
   };
   auto test_cases = {
diff --git a/tensorflow/core/data/stats_utils.cc b/tensorflow/core/data/stats_utils.cc
index 80c1de6dbd5576..12c12b8907994e 100644
--- a/tensorflow/core/data/stats_utils.cc
+++ b/tensorflow/core/data/stats_utils.cc
@@ -33,40 +33,40 @@ ABSL_CONST_INIT const char kFeaturesCount[] = "features_count";
 ABSL_CONST_INIT const char kFeatureValuesCount[] = "feature_values_count";
 ABSL_CONST_INIT const char kExamplesCount[] = "examples_count";
 
-string ExecutionTimeHistogramName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kExecutionTime);
+std::string ExecutionTimeHistogramName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kExecutionTime);
 }
 
-string ThreadUtilizationScalarName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kThreadUtilization);
+std::string ThreadUtilizationScalarName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kThreadUtilization);
 }
 
-string BufferSizeScalarName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kBufferSize);
+std::string BufferSizeScalarName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kBufferSize);
 }
 
-string BufferCapacityScalarName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kBufferCapacity);
+std::string BufferCapacityScalarName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kBufferCapacity);
 }
 
-string BufferUtilizationHistogramName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kBufferUtilization);
+std::string BufferUtilizationHistogramName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kBufferUtilization);
 }
 
-string FilterdElementsScalarName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kFilteredElements);
+std::string FilterdElementsScalarName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kFilteredElements);
 }
 
-string DroppedElementsScalarName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kDroppedElements);
+std::string DroppedElementsScalarName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kDroppedElements);
 }
 
-string FeatureHistogramName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kFeaturesCount);
+std::string FeatureHistogramName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kFeaturesCount);
 }
 
-string FeatureValueHistogramName(const string& prefix) {
-  return strings::StrCat(prefix, kDelimiter, kFeatureValuesCount);
+std::string FeatureValueHistogramName(const std::string& prefix) {
+  return absl::StrCat(prefix, kDelimiter, kFeatureValuesCount);
 }
 
 }  // namespace stats_utils
diff --git a/tensorflow/core/data/stats_utils.h b/tensorflow/core/data/stats_utils.h
index 5fa1eae397b39e..22a40b9be963a7 100644
--- a/tensorflow/core/data/stats_utils.h
+++ b/tensorflow/core/data/stats_utils.h
@@ -33,33 +33,33 @@ extern const char kFeatureValuesCount[];
 extern const char kExamplesCount[];
 
 // Name for tf.data function execution time (in ns) histogram metrics.
-string ExecutionTimeHistogramName(const string& prefix);
+std::string ExecutionTimeHistogramName(const std::string& prefix);
 
 // Name for thread utilization (ratio of threads being used and maximum number
 // of threads allocated) scalar metrics.
-string ThreadUtilizationScalarName(const string& prefix);
+std::string ThreadUtilizationScalarName(const std::string& prefix);
 
 // Name for buffer size scalar metrics.
-string BufferSizeScalarName(const string& prefix);
+std::string BufferSizeScalarName(const std::string& prefix);
 
 // Name for buffer capacity (maximum allocated buffer size) scalar metrics.
-string BufferCapacityScalarName(const string& prefix);
+std::string BufferCapacityScalarName(const std::string& prefix);
 
 // Name for buffer utilization (ratio of buffer size and maximum allocated
 // buffer size.) histogram metrics.
-string BufferUtilizationHistogramName(const string& prefix);
+std::string BufferUtilizationHistogramName(const std::string& prefix);
 
 // Name for filtered elements scalar metrics.
-string FilterdElementsScalarName(const string& prefix);
+std::string FilterdElementsScalarName(const std::string& prefix);
 
 // Name for dropped elements scalar metrics.
-string DroppedElementsScalarName(const string& prefix);
+std::string DroppedElementsScalarName(const std::string& prefix);
 
 // Name for features count histogram metrics.
-string FeatureHistogramName(const string& prefix);
+std::string FeatureHistogramName(const std::string& prefix);
 
 // Name for feature-values count histogram metrics.
-string FeatureValueHistogramName(const string& prefix);
+std::string FeatureValueHistogramName(const std::string& prefix);
 
 }  // namespace stats_utils
 }  // namespace data
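
The stats_utils helpers above all follow one pattern: join the caller's prefix and a metric keyword with the kDelimiter separator. A standalone sketch of the names they compose, assuming kDelimiter is "::" as in the tf.data code; the prefix is illustrative:

    #include <iostream>
    #include <string>
    #include "absl/strings/str_cat.h"

    int main() {
      const std::string prefix = "ParallelMapDataset";  // hypothetical prefix
      // ExecutionTimeHistogramName(prefix) composes a name of this shape:
      std::cout << absl::StrCat(prefix, "::", "execution_time") << "\n";
      // -> ParallelMapDataset::execution_time
    }
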
diff --git a/tensorflow/core/data/unbounded_thread_pool.cc b/tensorflow/core/data/unbounded_thread_pool.cc
index 3ffbbd5c70569f..6d196322298612 100644
--- a/tensorflow/core/data/unbounded_thread_pool.cc
+++ b/tensorflow/core/data/unbounded_thread_pool.cc
@@ -59,7 +59,7 @@ class UnboundedThreadPool::LogicalThreadFactory : public ThreadFactory {
  public:
   explicit LogicalThreadFactory(UnboundedThreadPool* pool) : pool_(pool) {}
 
-  std::unique_ptr<Thread> StartThread(const string& name,
+  std::unique_ptr<Thread> StartThread(const std::string& name,
                                       std::function<void()> fn) override {
     auto done = std::make_shared<Notification>();
     pool_->ScheduleOnWorkQueue(std::move(fn), done);
diff --git a/tensorflow/core/data/unbounded_thread_pool.h b/tensorflow/core/data/unbounded_thread_pool.h
index 1b89024a8db86e..1046c8ad5e7bb7 100644
--- a/tensorflow/core/data/unbounded_thread_pool.h
+++ b/tensorflow/core/data/unbounded_thread_pool.h
@@ -35,9 +35,9 @@ namespace data {
 // `UnboundedWorkQueue`.
 class UnboundedThreadPool : public thread::ThreadPoolInterface {
  public:
-  UnboundedThreadPool(Env* env, const string& thread_name)
+  UnboundedThreadPool(Env* env, const std::string& thread_name)
       : unbounded_work_queue_(env, thread_name) {}
-  UnboundedThreadPool(Env* env, const string& thread_name,
+  UnboundedThreadPool(Env* env, const std::string& thread_name,
                       const ThreadOptions& thread_options)
       : unbounded_work_queue_(env, thread_name, thread_options) {}
   ~UnboundedThreadPool() override = default;
diff --git a/tensorflow/core/debug/bfc_dump_reader.cc b/tensorflow/core/debug/bfc_dump_reader.cc
index 9ff9dd9d474e7b..dbad9888c99caa 100644
--- a/tensorflow/core/debug/bfc_dump_reader.cc
+++ b/tensorflow/core/debug/bfc_dump_reader.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
-MemoryDump ReadDumpFile(const string& fname) {
+MemoryDump ReadDumpFile(const std::string& fname) {
   absl::Status status;
-  uint64 file_size = 0;
+  uint64_t file_size = 0;
   status = Env::Default()->GetFileSize(fname, &file_size);
   if (!status.ok()) {
     LOG(ERROR) << "Failed to get size of " << fname;
@@ -66,7 +66,7 @@ MemoryDump FilterByChunkType(MemoryDump md, const char chunk_type) {
   return filtered;
 }
 
-void PrintChunk(const MemChunk& mc, const uint64 ac_offset, bool freed_at,
+void PrintChunk(const MemChunk& mc, const uint64_t ac_offset, bool freed_at,
                 const int64_t total_bytes, int64_t* cumulative_bytes) {
   // A size class corresponding approximately to log base 100.
   int size_class = floor(0.5 * log10(static_cast<double>(mc.size())));
@@ -120,7 +120,7 @@ void PrintSortedChunks(
   chunks.reserve(md.chunk_size());
   int64_t total_bytes = 0;
   int64_t cumulative_bytes = 0;
-  uint64 max_action_count = 0;
+  uint64_t max_action_count = 0;
   for (auto& it : md.chunk()) {
     chunks.push_back(&it);
     total_bytes += it.size();
@@ -129,7 +129,7 @@ void PrintSortedChunks(
     }
   }
   sort(chunks.begin(), chunks.end(), compare);
-  uint64 last_end = 0;
+  uint64_t last_end = 0;
   for (int i = 0; i < chunks.size(); ++i) {
     const MemChunk* c = chunks[i];
     if (by_addr && i > 0 && last_end != c->address()) {
@@ -174,12 +174,12 @@ void PrintChunksBySize(const MemoryDump& md, bool by_age, bool freed_at) {
                     by_age, freed_at, false /*by_addr*/);
 }
 
-void PrintChunksByOpName(const MemoryDump& md, const string& op_name,
+void PrintChunksByOpName(const MemoryDump& md, const std::string& op_name,
                          bool by_age, bool freed_at) {
   printf("------------Chunks matching \"%s\":----------------------\n",
          op_name.c_str());
   MemoryDump filtered;
-  uint64 total_bytes = 0;
+  uint64_t total_bytes = 0;
   filtered.set_allocator_name(md.allocator_name());
   for (const auto& it : md.bin_summary()) {
     *filtered.add_bin_summary() = it;
@@ -203,7 +203,7 @@ void PrintChunksByOpName(const MemoryDump& md, const string& op_name,
 void PrintSizeHistory(const MemoryDump& md, bool by_age) {
   printf("------------Allocated Bytes by Action Count--------\n");
   printf("num snapshots: %d\n", md.snap_shot_size());
-  uint64 max_action_count = 0;
+  uint64_t max_action_count = 0;
   if (by_age) {
     for (auto& it : md.snap_shot()) {
       if (it.action_count() > max_action_count) {
diff --git a/tensorflow/core/debug/debug_callback_registry.cc b/tensorflow/core/debug/debug_callback_registry.cc
index 97967a3f040eca..5ee0d53d507624 100644
--- a/tensorflow/core/debug/debug_callback_registry.cc
+++ b/tensorflow/core/debug/debug_callback_registry.cc
@@ -28,20 +28,20 @@ DebugCallbackRegistry* DebugCallbackRegistry::singleton() {
   return instance_;
 }
 
-void DebugCallbackRegistry::RegisterCallback(const string& key,
+void DebugCallbackRegistry::RegisterCallback(const std::string& key,
                                              EventCallback callback) {
   mutex_lock lock(mu_);
   keyed_callback_[key] = std::move(callback);
 }
 
 DebugCallbackRegistry::EventCallback* DebugCallbackRegistry::GetCallback(
-    const string& key) {
+    const std::string& key) {
   mutex_lock lock(mu_);
   auto iter = keyed_callback_.find(key);
   return iter == keyed_callback_.end() ? nullptr : &iter->second;
 }
 
-void DebugCallbackRegistry::UnregisterCallback(const string& key) {
+void DebugCallbackRegistry::UnregisterCallback(const std::string& key) {
   mutex_lock lock(mu_);
   keyed_callback_.erase(key);
 }
diff --git a/tensorflow/core/debug/debug_callback_registry.h b/tensorflow/core/debug/debug_callback_registry.h
index 94b57401418eb9..c3cf8d665af9d9 100644
--- a/tensorflow/core/debug/debug_callback_registry.h
+++ b/tensorflow/core/debug/debug_callback_registry.h
@@ -45,14 +45,14 @@ class DebugCallbackRegistry {
   static DebugCallbackRegistry* singleton();
 
   // Returns the registered callback, or nullptr, for key.
-  EventCallback* GetCallback(const string& key);
+  EventCallback* GetCallback(const std::string& key);
 
   // Associates callback with key. This must be called by clients observing
   // nodes to be exported by this callback router before running a session.
-  void RegisterCallback(const string& key, EventCallback callback);
+  void RegisterCallback(const std::string& key, EventCallback callback);
 
   // Removes the callback associated with key.
-  void UnregisterCallback(const string& key);
+  void UnregisterCallback(const std::string& key);
 
  private:
  DebugCallbackRegistry();
@@ -61,7 +61,7 @@ class DebugCallbackRegistry {
   mutex mu_;
 
   // Maps debug_url keys to callbacks for routing observed tensors.
-  std::map<string, EventCallback> keyed_callback_ TF_GUARDED_BY(mu_);
+  std::map<std::string, EventCallback> keyed_callback_ TF_GUARDED_BY(mu_);
 
   static DebugCallbackRegistry* instance_;
 };
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 10ee5a3d33b8ad..9b0fc5c517c170 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -30,8 +30,8 @@ namespace tensorflow {
 namespace {
 
 // TODO(cais): Switch to safe_strtob when available.
-absl::Status ParseBoolString(const string& bool_str, bool* bool_val) {
-  const string lower_bool_str = absl::AsciiStrToLower(bool_str);
+absl::Status ParseBoolString(const std::string& bool_str, bool* bool_val) {
+  const std::string lower_bool_str = absl::AsciiStrToLower(bool_str);
   if (lower_bool_str == "false" || lower_bool_str == "f" ||
       lower_bool_str == "0") {
     *bool_val = false;
@@ -60,15 +60,15 @@ absl::Status DebugNodeInserter::InsertNodes(
   }
 
   // Debug ops and URLs for wildcard node names (if any).
-  std::vector<string> default_debug_ops;
-  std::vector<string> default_debug_urls;
+  std::vector<std::string> default_debug_ops;
+  std::vector<std::string> default_debug_urls;
 
   // A map from tensor name (e.g., "node_a:0") to list of debug op names
   // (e.g., {"DebugIdentity", "DebugNanCount"})
-  std::unordered_map<string, std::vector<string>> tensor_watches;
+  std::unordered_map<std::string, std::vector<std::string>> tensor_watches;
   // A map from tensor name to debug_url.
-  std::unordered_map<string, std::vector<string>> tensor_watch_urls;
-  std::unordered_map<string, bool> tensor_tolerate_failures;
+  std::unordered_map<std::string, std::vector<std::string>> tensor_watch_urls;
+  std::unordered_map<std::string, bool> tensor_tolerate_failures;
 
   // Cache the proto content for fast lookup later
   for (const DebugTensorWatch& watch : watches) {
@@ -105,11 +105,11 @@ absl::Status DebugNodeInserter::InsertNodes(
       }
     }
 
-    string tensor_name =
+    std::string tensor_name =
         absl::StrCat(watch.node_name(), ":", watch.output_slot());
 
-    std::vector<string> debug_ops;
-    for (const string& debug_op : watch.debug_ops()) {
+    std::vector<std::string> debug_ops;
+    for (const std::string& debug_op : watch.debug_ops()) {
       debug_ops.push_back(debug_op);
     }
 
@@ -117,8 +117,8 @@ absl::Status DebugNodeInserter::InsertNodes(
     tensor_tolerate_failures[tensor_name] =
         watch.tolerate_debug_op_creation_failures();
 
-    std::vector<string> urls;
-    for (const string& url : watch.debug_urls()) {
+    std::vector<std::string> urls;
+    for (const std::string& url : watch.debug_urls()) {
       urls.push_back(url);
     }
     tensor_watch_urls[tensor_name] = urls;
@@ -148,7 +148,7 @@ absl::Status DebugNodeInserter::InsertNodes(
     // Iterate through all output slots of the node.
     for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
          ++src_output_slot) {
-      const string tensor_name =
+      const std::string tensor_name =
           absl::StrCat(src_node->name(), ":", src_output_slot);
       const bool explicit_tensor_match =
           tensor_watches.find(tensor_name) != tensor_watches.end();
@@ -176,10 +176,10 @@ absl::Status DebugNodeInserter::InsertNodes(
                                                  src_output_slot, &memory_type));
 
       // Create the copy node for the watched tensor.
-      const std::vector<string> debug_ops = explicit_tensor_match
-                                                ? tensor_watches[tensor_name]
-                                                : default_debug_ops;
-      const std::vector<string> debug_urls =
+      const std::vector<std::string> debug_ops =
+          explicit_tensor_match ? tensor_watches[tensor_name]
+                                : default_debug_ops;
+      const std::vector<std::string> debug_urls =
           explicit_tensor_match ? tensor_watch_urls[tensor_name]
                                 : default_debug_urls;
       Node* copy_node;
@@ -200,7 +200,7 @@ absl::Status DebugNodeInserter::InsertNodes(
       // Create all requested debug nodes and their edges to the Copy node.
       std::vector<Node*> debug_nodes;
       for (size_t i = 0; i < debug_ops.size(); ++i) {
-        const string& debug_op_name = debug_ops[i];
+        const std::string& debug_op_name = debug_ops[i];
 
         Node* debug_node;
         absl::Status debug_s = CreateDebugNode(
@@ -280,17 +280,17 @@ void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
 }
 
 // static
-const string DebugNodeInserter::GetCopyNodeName(const string& node_name,
-                                                const int output_slot) {
+const std::string DebugNodeInserter::GetCopyNodeName(
+    const std::string& node_name, const int output_slot) {
   // For example, if the watched node is named "node1" and the output slot
   // is 0, the debug node will be called: __copy_node1_0
   return absl::StrCat("__copy_", node_name, "_", output_slot);
 }
 
 // static
-const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
-                                                 const int debug_op_num,
-                                                 const string& debug_op_name) {
+const std::string DebugNodeInserter::GetDebugNodeName(
+    const std::string& tensor_name, const int debug_op_num,
+    const std::string& debug_op_name) {
   // For example, if the watched node is named "node1" and the debug op that
   // watches the output slot of node1 is of the type "DebugNanCount", the
   // debug node will be called: __dbg_node1_0_0_DebugNanCount.
@@ -301,23 +301,24 @@ const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
 // static
 absl::Status DebugNodeInserter::CreateCopyNode(
     Graph* graph, const DeviceType device_type, const bool is_host_memory,
-    const string& src_node_name, const int src_output, const DataType src_dt,
-    const string& tensor_name, const std::vector<string>& debug_ops,
-    const std::vector<string>& debug_urls, Node** copy_node) {
-  const string kGatedGrpcAttributeKey = "gated_grpc";
+    const std::string& src_node_name, const int src_output,
+    const DataType src_dt, const std::string& tensor_name,
+    const std::vector<std::string>& debug_ops,
+    const std::vector<std::string>& debug_urls, Node** copy_node) {
+  const std::string kGatedGrpcAttributeKey = "gated_grpc";
 
   NodeDef node_def;
   const KernelDef* kdef;
 
-  const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
-  const string copy_node_name = GetCopyNodeName(src_node_name, src_output);
+  const std::string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
+  const std::string copy_node_name = GetCopyNodeName(src_node_name, src_output);
 
   // Cross debug_ops and debug_urls to get the list of debug ops and watches.
-  std::vector<string> debug_ops_spec;
-  for (const string& debug_op : debug_ops) {
-    for (const string& debug_url : debug_urls) {
-      string debug_op_name_proper;
-      std::unordered_map<string, string> custom_attributes;
+  std::vector<std::string> debug_ops_spec;
+  for (const std::string& debug_op : debug_ops) {
+    for (const std::string& debug_url : debug_urls) {
+      std::string debug_op_name_proper;
+      std::unordered_map<std::string, std::string> custom_attributes;
       TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
                                           &custom_attributes));
 
@@ -363,24 +364,25 @@ absl::Status DebugNodeInserter::CreateCopyNode(
 
 // static
 absl::Status DebugNodeInserter::ParseDebugOpName(
-    const string& debug_op_name, string* debug_op_name_proper,
-    std::unordered_map<string, string>* attributes) {
+    const std::string& debug_op_name, std::string* debug_op_name_proper,
+    std::unordered_map<std::string, std::string>* attributes) {
   const size_t l_index = debug_op_name.find('(');
   const size_t r_index = debug_op_name.find(')');
-  if (l_index == string::npos && r_index == string::npos) {
+  if (l_index == std::string::npos && r_index == std::string::npos) {
     *debug_op_name_proper = debug_op_name;
   } else {
-    if (l_index == string::npos || l_index == 0 ||
+    if (l_index == std::string::npos || l_index == 0 ||
         r_index != debug_op_name.size() - 1) {
       return absl::InvalidArgumentError(
           absl::StrCat("Malformed debug op name \"", debug_op_name, "\""));
     }
 
    *debug_op_name_proper = debug_op_name.substr(0, l_index);
-    string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);
+    std::string arguments =
+        debug_op_name.substr(l_index + 1, r_index - l_index - 1);
 
-    std::vector<string> attribute_segs = str_util::Split(arguments, ";");
-    for (const string& attribute_seg : attribute_segs) {
+    std::vector<std::string> attribute_segs = str_util::Split(arguments, ";");
+    for (const std::string& attribute_seg : attribute_segs) {
       absl::string_view seg(attribute_seg);
       str_util::RemoveWhitespaceContext(&seg);
       if (seg.empty()) {
@@ -388,13 +390,13 @@ absl::Status DebugNodeInserter::ParseDebugOpName(
       }
 
       const size_t eq_index = seg.find('=');
-      if (eq_index == string::npos) {
+      if (eq_index == std::string::npos) {
         return absl::InvalidArgumentError(absl::StrCat(
             "Malformed attributes in debug op name \"", debug_op_name, "\""));
       }
 
-      const string key(seg.substr(0, eq_index));
-      const string value(
+      const std::string key(seg.substr(0, eq_index));
+      const std::string value(
           seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1));
       if (key.empty() || value.empty()) {
         return absl::InvalidArgumentError(absl::StrCat(
@@ -415,17 +417,18 @@ absl::Status DebugNodeInserter::ParseDebugOpName(
 
 // static
 absl::Status DebugNodeInserter::SetDebugNodeAttributes(
-    Node* debug_node, const std::unordered_map<string, string>& attributes) {
-  std::unordered_set<string> unfulfilled_keys;
+    Node* debug_node,
+    const std::unordered_map<std::string, std::string>& attributes) {
+  std::unordered_set<std::string> unfulfilled_keys;
   for (const auto& item : attributes) {
     unfulfilled_keys.insert(item.first);
   }
 
   for (const auto& attr : debug_node->op_def().attr()) {
     if (attributes.find(attr.name()) != attributes.end()) {
-      const string& attr_value = attributes.at(attr.name());
+      const std::string& attr_value = attributes.at(attr.name());
       if (attr.type() == "string") {
-        debug_node->AddAttr<string>(attr.name(), attr_value);
+        debug_node->AddAttr<std::string>(attr.name(), attr_value);
       } else if (attr.type() == "float") {
        float float_value = 0.0;
        if (!absl::SimpleAtof(attr_value, &float_value)) {
@@ -472,19 +475,19 @@ absl::Status DebugNodeInserter::SetDebugNodeAttributes(
 
 // static
 absl::Status DebugNodeInserter::CreateDebugNode(
-    Graph* graph, const Device& device, const string& src_copy_node_name,
-    const DataType src_dt, const string& tensor_name,
-    const std::vector<string>& debug_urls, const int debug_op_num,
-    const string& debug_op_name, Node** debug_node) {
+    Graph* graph, const Device& device, const std::string& src_copy_node_name,
+    const DataType src_dt, const std::string& tensor_name,
+    const std::vector<std::string>& debug_urls, const int debug_op_num,
+    const std::string& debug_op_name, Node** debug_node) {
   NodeDef node_def;
   const KernelDef* kdef;
 
-  string debug_op_name_proper;
-  std::unordered_map<string, string> custom_attributes;
+  std::string debug_op_name_proper;
+  std::unordered_map<std::string, std::string> custom_attributes;
   TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
                                       &custom_attributes));
 
-  const string debug_node_name =
+  const std::string debug_node_name =
       GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
   auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
                      .Input(src_copy_node_name, 0, src_dt)
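
Read together with the header comment in debug_graph_utils.h below, ParseDebugOpName accepts either a bare op name or a name with a parenthesized attribute list. For instance, parsing "DebugNumericSummary(mute_if_healthy=true; threshold=300.0)" yields the proper name "DebugNumericSummary" and the attribute map {mute_if_healthy: "true", threshold: "300.0"}; this is exactly the case exercised by the tests in debug_graph_utils_test.cc further down.
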
diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h
index 27cfb357e2b9d9..9552becbe7b27c 100644
--- a/tensorflow/core/debug/debug_graph_utils.h
+++ b/tensorflow/core/debug/debug_graph_utils.h
@@ -82,20 +82,21 @@ class DebugNodeInserter {
   static void DeparallelizeWhileLoops(Graph* graph, Device* device);
 
   // Get canonical name of a copy node.
-  static const string GetCopyNodeName(const string& node_name,
-                                      const int output_slot);
+  static const std::string GetCopyNodeName(const std::string& node_name,
+                                           const int output_slot);
 
   // Get canonical name of a debug node.
-  static const string GetDebugNodeName(const string& tensor_name,
-                                       const int debug_op_num,
-                                       const string& debug_op_name);
+  static const std::string GetDebugNodeName(const std::string& tensor_name,
+                                            const int debug_op_num,
+                                            const std::string& debug_op_name);
 
  private:
  static absl::Status CreateCopyNode(
      Graph* graph, const DeviceType device_type, const bool is_host_memory,
-     const string& src_node_name, const int src_output, const DataType src_dt,
-     const string& tensor_name, const std::vector<string>& debug_ops,
-     const std::vector<string>& debug_urls, Node** copy_node);
+     const std::string& src_node_name, const int src_output,
+     const DataType src_dt, const std::string& tensor_name,
+     const std::vector<std::string>& debug_ops,
+     const std::vector<std::string>& debug_urls, Node** copy_node);
 
   // Parse the debug_op_name string to extract proper op name and attributes.
   // debug_op_name can be the proper op name only, e.g., "DebugNumericSummary".
   // It can also have the format
   // <op_name>(<attr_name_1>=<attr_value_1>;<attr_name_2>=<attr_value_2>;...),
   // in which the attribute name-value pairs are separated
   // with semicolons (";"), with optional whitespace in between, e.g.,
   // "DebugNumericSummary(mute_if_healthy=true, lower_bound=-100.0)".
@@ -104,17 +105,18 @@ class DebugNodeInserter {
   static absl::Status ParseDebugOpName(
-      const string& debug_op_name, string* debug_op_name_proper,
-      std::unordered_map<string, string>* attributes);
+      const std::string& debug_op_name, std::string* debug_op_name_proper,
+      std::unordered_map<std::string, std::string>* attributes);
 
   static absl::Status SetDebugNodeAttributes(
-      Node* debug_node, const std::unordered_map<string, string>& attributes);
+      Node* debug_node,
+      const std::unordered_map<std::string, std::string>& attributes);
 
   static absl::Status CreateDebugNode(
-      Graph* graph, const Device& device, const string& src_copy_node_name,
-      const DataType src_dt, const string& tensor_name,
-      const std::vector<string>& debug_urls, const int debug_op_num,
-      const string& debug_op_name, Node** debug_node);
+      Graph* graph, const Device& device, const std::string& src_copy_node_name,
+      const DataType src_dt, const std::string& tensor_name,
+      const std::vector<std::string>& debug_urls, const int debug_op_num,
+      const std::string& debug_op_name, Node** debug_node);
 
   // TODO(cais): Cut down the number of args to this method.
   friend class DebugGraphUtilsTest;
diff --git a/tensorflow/core/debug/debug_graph_utils_test.cc b/tensorflow/core/debug/debug_graph_utils_test.cc
index 207b8bc1b3c1f7..d1184d5d18c498 100644
--- a/tensorflow/core/debug/debug_graph_utils_test.cc
+++ b/tensorflow/core/debug/debug_graph_utils_test.cc
@@ -25,16 +25,16 @@ namespace tensorflow {
 class DebugGraphUtilsTest : public ::testing::Test {
  protected:
   absl::Status ParseDebugOpName(
-      const string& debug_op_name, string* debug_op_name_proper,
-      std::unordered_map<string, string>* attributes) {
+      const std::string& debug_op_name, std::string* debug_op_name_proper,
+      std::unordered_map<std::string, std::string>* attributes) {
     return DebugNodeInserter::ParseDebugOpName(
         debug_op_name, debug_op_name_proper, attributes);
   }
 };
 
 TEST_F(DebugGraphUtilsTest, TestParseNoAttributeDebugOpName) {
-  string debug_op_name_proper;
-  std::unordered_map<string, string> attributes;
+  std::string debug_op_name_proper;
+  std::unordered_map<std::string, std::string> attributes;
   TF_ASSERT_OK(
       ParseDebugOpName("DebugIdentity", &debug_op_name_proper, &attributes));
   ASSERT_EQ("DebugIdentity", debug_op_name_proper);
@@ -42,8 +42,8 @@ TEST_F(DebugGraphUtilsTest, TestParseNoAttributeDebugOpName) {
 }
 
 TEST_F(DebugGraphUtilsTest, TestMalformedDebugOpName) {
-  string debug_op_name_proper;
-  std::unordered_map<string, string> attributes;
+  std::string debug_op_name_proper;
+  std::unordered_map<std::string, std::string> attributes;
 
   absl::Status s = ParseDebugOpName("(mute_if_healthy=true)",
                                     &debug_op_name_proper, &attributes);
@@ -59,8 +59,8 @@ TEST_F(DebugGraphUtilsTest, TestMalformedDebugOpName) {
 }
 
 TEST_F(DebugGraphUtilsTest, TestDebugOpNameWithMalformedAttributes) {
-  string debug_op_name_proper;
-  std::unordered_map<string, string> attributes;
+  std::string debug_op_name_proper;
+  std::unordered_map<std::string, std::string> attributes;
 
   absl::Status s = ParseDebugOpName("DebugNumericSummary(=)",
                                     &debug_op_name_proper, &attributes);
@@ -89,8 +89,8 @@ TEST_F(DebugGraphUtilsTest, TestDebugOpNameWithMalformedAttributes) {
 }
 
 TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithSingleAttribute) {
-  string debug_op_name_proper;
-  std::unordered_map<string, string> attributes;
+  std::string debug_op_name_proper;
+  std::unordered_map<std::string, std::string> attributes;
 
   TF_ASSERT_OK(ParseDebugOpName("DebugNumericSummary()", &debug_op_name_proper,
                                 &attributes));
@@ -106,8 +106,8 @@ TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithSingleAttribute) {
 }
 
 TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithMoreThanOneAttributes) {
-  string debug_op_name_proper;
-  std::unordered_map<string, string> attributes;
+  std::string debug_op_name_proper;
+  std::unordered_map<std::string, std::string> attributes;
 
   TF_ASSERT_OK(ParseDebugOpName(
"DebugNumericSummary(mute_if_healthy=true; threshold=300.0)", &debug_op_name_proper, &attributes)); @@ -128,8 +128,8 @@ TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithMoreThanOneAttributes) { } TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithMoreDuplicateAttributes) { - string debug_op_name_proper; - std::unordered_map attributes; + std::string debug_op_name_proper; + std::unordered_map attributes; absl::Status s = ParseDebugOpName( "DebugNumericSummary(mute_if_healthy=true; lower_bound=3; " "mute_if_healthy=false;)", @@ -138,8 +138,8 @@ TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithMoreDuplicateAttributes) { } TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithWhitespaceInAttributes) { - string debug_op_name_proper; - std::unordered_map attributes; + std::string debug_op_name_proper; + std::unordered_map attributes; TF_ASSERT_OK(ParseDebugOpName( "DebugNumericSummary( mute_if_healthy=true; threshold=300.0 )", diff --git a/tensorflow/core/debug/debug_grpc_io_utils_test.cc b/tensorflow/core/debug/debug_grpc_io_utils_test.cc index f6618666101361..19c79a04d2123d 100644 --- a/tensorflow/core/debug/debug_grpc_io_utils_test.cc +++ b/tensorflow/core/debug/debug_grpc_io_utils_test.cc @@ -34,7 +34,7 @@ class GrpcDebugTest : public ::testing::Test { protected: struct ServerData { int port; - string url; + std::string url; std::unique_ptr server; std::unique_ptr thread_pool; }; @@ -86,7 +86,7 @@ TEST_F(GrpcDebugTest, ConnectionTimeoutWorks) { SetChannelConnectionTimeoutMicros(kShortTimeoutMicros); ASSERT_EQ(kShortTimeoutMicros, GetChannelConnectionTimeoutMicros()); - const string& kInvalidGrpcUrl = + const std::string& kInvalidGrpcUrl = absl::StrCat("grpc://localhost:", testing::PickUnusedPortOrDie()); Tensor tensor(DT_FLOAT, TensorShape({1, 1})); tensor.flat()(0) = 42.0; @@ -98,10 +98,11 @@ TEST_F(GrpcDebugTest, ConnectionTimeoutWorks) { TF_ASSERT_OK(DebugIO::CloseDebugURL(kInvalidGrpcUrl)); ASSERT_FALSE(publish_status.ok()); - const string expected_error_msg = strings::StrCat( + const std::string expected_error_msg = strings::StrCat( "Failed to connect to gRPC channel at ", kInvalidGrpcUrl.substr(7), " within a timeout of ", kShortTimeoutMicros / 1e6, " s"); - ASSERT_NE(string::npos, publish_status.message().find(expected_error_msg)); + ASSERT_NE(std::string::npos, + publish_status.message().find(expected_error_msg)); } TEST_F(GrpcDebugTest, ConnectionToDelayedStartingServerWorks) { @@ -149,7 +150,7 @@ TEST_F(GrpcDebugTest, SendSingleDebugTensorViaGrpcTest) { TEST_F(GrpcDebugTest, SendDebugTensorWithLargeStringAtIndex0ViaGrpcTest) { Tensor tensor(DT_STRING, TensorShape({1, 1})); - tensor.flat()(0) = string(5000 * 1024, 'A'); + tensor.flat()(0) = std::string(5000 * 1024, 'A'); const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "foo_tensor", 0, "DebugIdentity"); const absl::Status status = DebugIO::PublishDebugTensor( @@ -158,14 +159,14 @@ TEST_F(GrpcDebugTest, SendDebugTensorWithLargeStringAtIndex0ViaGrpcTest) { ASSERT_NE(status.message().find("string value at index 0 from debug " "node foo_tensor:0:DebugIdentity does " "not fit gRPC message size limit"), - string::npos); + std::string::npos); TF_ASSERT_OK(DebugIO::CloseDebugURL(server_data_.url)); } TEST_F(GrpcDebugTest, SendDebugTensorWithLargeStringAtIndex1ViaGrpcTest) { Tensor tensor(DT_STRING, TensorShape({1, 2})); tensor.flat()(0) = "A"; - tensor.flat()(1) = string(5000 * 1024, 'A'); + tensor.flat()(1) = std::string(5000 * 1024, 'A'); const DebugNodeKey 
kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "foo_tensor", 0, "DebugIdentity"); const absl::Status status = DebugIO::PublishDebugTensor( @@ -174,7 +175,7 @@ TEST_F(GrpcDebugTest, SendDebugTensorWithLargeStringAtIndex1ViaGrpcTest) { ASSERT_NE(status.message().find("string value at index 1 from debug " "node foo_tensor:0:DebugIdentity does " "not fit gRPC message size limit"), - string::npos); + std::string::npos); TF_ASSERT_OK(DebugIO::CloseDebugURL(server_data_.url)); } @@ -197,7 +198,7 @@ TEST_F(GrpcDebugTest, SendMultipleDebugTensorsSynchronizedViaGrpcTest) { int tensor_count TF_GUARDED_BY(mu) = 0; std::vector statuses TF_GUARDED_BY(mu); - const std::vector urls({server_data_.url}); + const std::vector urls({server_data_.url}); // Set up the concurrent tasks of sending Tensors via an Event stream to the // server. @@ -210,7 +211,7 @@ TEST_F(GrpcDebugTest, SendMultipleDebugTensorsSynchronizedViaGrpcTest) { } // Different concurrent tasks will send different tensors. - const uint64 wall_time = Env::Default()->NowMicros(); + const uint64_t wall_time = Env::Default()->NowMicros(); absl::Status publish_status = DebugIO::PublishDebugTensor( DebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", absl::StrCat("synchronized_node_", this_count), 0, @@ -247,7 +248,7 @@ TEST_F(GrpcDebugTest, SendMultipleDebugTensorsSynchronizedViaGrpcTest) { // One prep tensor plus kSends concurrent tensors are expected. ASSERT_EQ(kSends, server_data_.server->node_names.size()); for (size_t i = 0; i < server_data_.server->node_names.size(); ++i) { - std::vector items = + std::vector items = str_util::Split(server_data_.server->node_names[i], '_'); int tensor_index; strings::safe_strto32(items[2], &tensor_index); @@ -267,10 +268,10 @@ TEST_F(GrpcDebugTest, SendDebugTensorsThroughMultipleRoundsUsingGrpcGating) { Tensor tensor(DT_INT32, TensorShape({1, 1})); tensor.flat()(0) = 42; - const std::vector urls({server_data_.url}); + const std::vector urls({server_data_.url}); for (int i = 0; i < 3; ++i) { server_data_.server->ClearReceivedDebugData(); - const uint64 wall_time = Env::Default()->NowMicros(); + const uint64_t wall_time = Env::Default()->NowMicros(); // On the 1st send (i == 0), gating is disabled, so data should be sent. // On the 2nd send (i == 1), gating is enabled, and the server has enabled @@ -315,10 +316,10 @@ TEST_F(GrpcDebugTest, SendDebugTensorsThroughMultipleRoundsUnderReadWriteMode) { Tensor tensor(DT_INT32, TensorShape({1, 1})); tensor.flat()(0) = 42; - const std::vector urls({server_data_.url}); + const std::vector urls({server_data_.url}); for (int i = 0; i < 3; ++i) { server_data_.server->ClearReceivedDebugData(); - const uint64 wall_time = Env::Default()->NowMicros(); + const uint64_t wall_time = Env::Default()->NowMicros(); // On the 1st send (i == 0), gating is disabled, so data should be sent. 
     // On the 2nd send (i == 1), gating is enabled, and the server has enabled
@@ -315,10 +316,10 @@ TEST_F(GrpcDebugTest, SendDebugTensorsThroughMultipleRoundsUnderReadWriteMode) {
   Tensor tensor(DT_INT32, TensorShape({1, 1}));
   tensor.flat<int32>()(0) = 42;
 
-  const std::vector<string> urls({server_data_.url});
+  const std::vector<std::string> urls({server_data_.url});
   for (int i = 0; i < 3; ++i) {
     server_data_.server->ClearReceivedDebugData();
-    const uint64 wall_time = Env::Default()->NowMicros();
+    const uint64_t wall_time = Env::Default()->NowMicros();
 
     // On the 1st send (i == 0), gating is disabled, so data should be sent.
     // On the 2nd send (i == 1), gating is enabled, and the server has enabled
@@ -367,8 +368,8 @@ TEST_F(GrpcDebugTest, TestGateDebugNodeOnEmptyEnabledSet) {
 }
 
 TEST_F(GrpcDebugTest, TestGateDebugNodeOnNonEmptyEnabledSet) {
-  const string kGrpcUrl1 = "grpc://localhost:3333";
-  const string kGrpcUrl2 = "grpc://localhost:3334";
+  const std::string kGrpcUrl1 = "grpc://localhost:3333";
+  const std::string kGrpcUrl2 = "grpc://localhost:3334";
 
   DebugGrpcIO::SetDebugNodeKeyGrpcState(
       kGrpcUrl1, "foo:0:DebugIdentity",
@@ -398,9 +399,9 @@ TEST_F(GrpcDebugTest, TestGateDebugNodeOnNonEmptyEnabledSet) {
 }
 
 TEST_F(GrpcDebugTest, TestGateDebugNodeOnMultipleEmptyEnabledSets) {
-  const string kGrpcUrl1 = "grpc://localhost:3333";
-  const string kGrpcUrl2 = "grpc://localhost:3334";
-  const string kGrpcUrl3 = "grpc://localhost:3335";
+  const std::string kGrpcUrl1 = "grpc://localhost:3333";
+  const std::string kGrpcUrl2 = "grpc://localhost:3334";
+  const std::string kGrpcUrl3 = "grpc://localhost:3335";
 
   DebugGrpcIO::SetDebugNodeKeyGrpcState(
       kGrpcUrl1, "foo:0:DebugIdentity",
@@ -434,14 +435,14 @@ TEST_F(GrpcDebugTest, TestGateDebugNodeOnNonEmptyEnabledSetAndEmptyURLs) {
       "grpc://localhost:3333", "foo:0:DebugIdentity",
       EventReply::DebugOpStateChange::READ_ONLY);
 
-  std::vector<string> debug_urls_1;
+  std::vector<std::string> debug_urls_1;
   ASSERT_FALSE(
       DebugIO::IsDebugNodeGateOpen("foo:1:DebugIdentity", debug_urls_1));
 }
 
 TEST_F(GrpcDebugTest, TestGateCopyNodeOnEmptyEnabledSet) {
-  const string kGrpcUrl1 = "grpc://localhost:3333";
-  const string kWatch1 = "foo:0:DebugIdentity";
+  const std::string kGrpcUrl1 = "grpc://localhost:3333";
+  const std::string kWatch1 = "foo:0:DebugIdentity";
 
   ASSERT_FALSE(DebugIO::IsCopyNodeGateOpen(
       {DebugWatchAndURLSpec(kWatch1, kGrpcUrl1, true)}));
@@ -456,10 +457,10 @@ TEST_F(GrpcDebugTest, TestGateCopyNodeOnEmptyEnabledSet) {
 }
 
 TEST_F(GrpcDebugTest, TestGateCopyNodeOnNonEmptyEnabledSet) {
-  const string kGrpcUrl1 = "grpc://localhost:3333";
-  const string kGrpcUrl2 = "grpc://localhost:3334";
-  const string kWatch1 = "foo:0:DebugIdentity";
-  const string kWatch2 = "foo:1:DebugIdentity";
+  const std::string kGrpcUrl1 = "grpc://localhost:3333";
+  const std::string kGrpcUrl2 = "grpc://localhost:3334";
+  const std::string kWatch1 = "foo:0:DebugIdentity";
+  const std::string kWatch2 = "foo:1:DebugIdentity";
 
   DebugGrpcIO::SetDebugNodeKeyGrpcState(
       kGrpcUrl1, kWatch1, EventReply::DebugOpStateChange::READ_ONLY);
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index 2b593ae6601cd1..0f3dfb8bb737f4 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -44,11 +44,11 @@ ::grpc::Status TestEventListenerImpl::SendEvents(
     } else if (event.has_summary()) {
       const Summary::Value& val = event.summary().value(0);
 
-      std::vector<string> name_items =
+      std::vector<std::string> name_items =
           tensorflow::str_util::Split(val.node_name(), ':');
 
-      const string node_name = name_items[0];
-      const string debug_op = name_items[2];
+      const std::string node_name = name_items[0];
+      const std::string debug_op = name_items[2];
 
       const TensorProto& tensor_proto = val.tensor();
       Tensor tensor(tensor_proto.dtype());
@@ -156,7 +156,7 @@ void TestEventListenerImpl::StopServer() {
   }
 }
-bool PollTillFirstRequestSucceeds(const string& server_url,
+bool PollTillFirstRequestSucceeds(const std::string& server_url,
                                   const size_t max_attempts) {
   const int kSleepDurationMicros = 100 * 1000;
   size_t n_attempts = 0;
@@ -168,7 +168,7 @@ bool PollTillFirstRequestSucceeds(const string& server_url,
   prep_tensor.flat<float>()(0) = 42.0f;
 
   while (n_attempts++ < max_attempts) {
-    const uint64 wall_time = Env::Default()->NowMicros();
+    const uint64_t wall_time = Env::Default()->NowMicros();
     absl::Status publish_s = DebugIO::PublishDebugTensor(
         DebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "prep_node", 0,
                      "DebugIdentity"),
diff --git a/tensorflow/core/debug/debug_grpc_testlib.h b/tensorflow/core/debug/debug_grpc_testlib.h
index 2a57df8d866331..415ce6435c7bdf 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.h
+++ b/tensorflow/core/debug/debug_grpc_testlib.h
@@ -48,12 +48,12 @@ class TestEventListenerImpl final : public grpc::EventListener::Service {
       const EventReply::DebugOpStateChange::State new_state,
       const DebugNodeKey& debug_node_key);
 
-  std::vector<string> debug_metadata_strings;
-  std::vector<string> encoded_graph_defs;
-  std::vector<string> device_names;
-  std::vector<string> node_names;
-  std::vector<int32> output_slots;
-  std::vector<string> debug_ops;
+  std::vector<std::string> debug_metadata_strings;
+  std::vector<std::string> encoded_graph_defs;
+  std::vector<std::string> device_names;
+  std::vector<std::string> node_names;
+  std::vector<int32_t> output_slots;
+  std::vector<std::string> debug_ops;
   std::vector<Tensor> debug_tensors;
 
 private:
@@ -77,7 +77,7 @@ class TestEventListenerImpl final : public grpc::EventListener::Service {
 //
 // Returns:
 //   Whether the polling succeeded within max_attempts.
-bool PollTillFirstRequestSucceeds(const string& server_url,
+bool PollTillFirstRequestSucceeds(const std::string& server_url,
                                   const size_t max_attempts);
 
 }  // namespace test
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 50677be5fa3769..430bc36ea1a96c 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -66,8 +66,8 @@ constexpr absl::string_view kDumpSubDirName = "node-io-dump";
 // shape). It does not set the value.tensor field, which should be set by the
 // caller separately.
 Event PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
-                             const uint64 wall_time_us, const size_t num_chunks,
-                             const size_t chunk_index,
+                             const uint64_t wall_time_us,
+                             const size_t num_chunks, const size_t chunk_index,
                              const DataType& tensor_dtype,
                              const TensorShapeProto& tensor_shape) {
   Event event;
@@ -92,7 +92,7 @@ Event PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
   metadata.set_chunk_index(chunk_index);
 
   // Encode the data in JSON.
-  string json_output;
+  std::string json_output;
   tensorflow::protobuf::util::JsonPrintOptions json_options;
   json_options.always_print_fields_with_no_presence = true;
   auto status = tensorflow::protobuf::util::MessageToJsonString(
@@ -120,7 +120,7 @@ Event PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
 // (i.e., an estimate that is usually too large, but never too small under the
 // gRPC message size limit) of the Varint-encoded length, to workaround the lack
 // of a portable length function.
-const size_t StringValMaxBytesInProto(const string& str) {
+const size_t StringValMaxBytesInProto(const std::string& str) {
 #if defined(PLATFORM_GOOGLE)
   return str.size() + DebugGrpcIO::kGrpcMaxVarintLengthSize;
 #else
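
StringValMaxBytesInProto errs deliberately on the large side: the proto wire format prefixes each string with a varint length, and for any string the debugger handles that prefix fits in at most kGrpcMaxVarintLengthSize = 6 bytes (declared later in this diff), so size() + 6 may over-count but never under-counts against the message limit. Worked through for the 5000 * 1024-byte strings in the tests above: 5,120,000 + 6 bytes exceeds kGrpcMessageSizeLimitBytes = 4000 * 1024 = 4,096,000 bytes, which is why those publishes fail with the "does not fit gRPC message size limit" error.
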
@@ -131,11 +131,12 @@ const size_t StringValMaxBytesInProto(const string& str) {
 
 // Breaks a string Tensor (represented as a TensorProto) as a vector of Event
 // protos.
 absl::Status WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key,
-                                      const uint64 wall_time_us,
+                                      const uint64_t wall_time_us,
                                       const size_t chunk_size_limit,
                                       TensorProto* tensor_proto,
                                       std::vector<Event>* events) {
-  const protobuf::RepeatedPtrField<string>& strs = tensor_proto->string_val();
+  const protobuf::RepeatedPtrField<std::string>& strs =
+      tensor_proto->string_val();
   const size_t num_strs = strs.size();
   const size_t chunk_size_ub = chunk_size_limit > 0 ? chunk_size_limit
@@ -191,7 +192,8 @@ absl::Status WrapStringTensorAsEvents(const DebugNodeKey& debug_node_key,
 // If chunk_size_limit <= 0, the tensor will not be broken into chunks, i.e., a
 // length-1 vector will be returned, regardless of the size of the tensor.
 absl::Status WrapTensorAsEvents(const DebugNodeKey& debug_node_key,
-                                const Tensor& tensor, const uint64 wall_time_us,
+                                const Tensor& tensor,
+                                const uint64_t wall_time_us,
                                 const size_t chunk_size_limit,
                                 std::vector<Event>* events) {
   TensorProto tensor_proto;
@@ -237,10 +239,11 @@ absl::Status WrapTensorAsEvents(const DebugNodeKey& debug_node_key,
 // TOCTOU race condition is not of concern here due to the fact that tfdbg
 // sets parallel_iterations attribute of all while_loops to 1 to prevent
 // the same node from being executed multiple times concurrently.
-string AppendTimestampToFilePath(const string& in, const uint64 timestamp) {
-  string out = absl::StrCat(in, "_", timestamp);
+std::string AppendTimestampToFilePath(const std::string& in,
+                                      const uint64_t timestamp) {
+  std::string out = absl::StrCat(in, "_", timestamp);
 
-  uint64 i = 1;
+  uint64_t i = 1;
   while (Env::Default()->FileExists(out).ok()) {
     out = strings::StrCat(in, "_", timestamp, "-", i);
     ++i;
@@ -251,11 +254,10 @@ string AppendTimestampToFilePath(const string& in, const uint64 timestamp) {
 #ifndef PLATFORM_WINDOWS
 // Publishes encoded GraphDef through a gRPC debugger stream, in chunks,
 // conforming to the gRPC message size limit.
-absl::Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
-                                            const string& device_name,
-                                            const int64_t wall_time,
-                                            const string& debug_url) {
-  const uint64 hash = ::tensorflow::Hash64(encoded_graph_def);
+absl::Status PublishEncodedGraphDefInChunks(
+    const std::string& encoded_graph_def, const std::string& device_name,
+    const int64_t wall_time, const std::string& debug_url) {
+  const uint64_t hash = ::tensorflow::Hash64(encoded_graph_def);
   const size_t total_length = encoded_graph_def.size();
   const size_t num_chunks =
       static_cast<size_t>(std::ceil(static_cast<float>(total_length) /
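
Both WrapStringTensorAsEvents and PublishEncodedGraphDefInChunks split oversized payloads the same way: the chunk count is the ceiling of the total byte length over the per-message budget (the exact divisor falls outside this hunk's context). As a worked example, a 10,485,760-byte serialized GraphDef against a 4,096,000-byte budget needs ceil(10,485,760 / 4,096,000) = 3 chunks.
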
@@ -297,11 +299,12 @@ const char* const DebugIO::kGraphTag = "graph_";
 
 const char* const DebugIO::kHashTag = "hash";
 
-absl::Status ReadEventFromFile(const string& dump_file_path, Event* event) {
+absl::Status ReadEventFromFile(const std::string& dump_file_path,
+                               Event* event) {
   Env* env(Env::Default());
 
-  string content;
-  uint64 file_size = 0;
+  std::string content;
+  uint64_t file_size = 0;
 
   absl::Status s = env->GetFileSize(dump_file_path, &file_size);
   if (!s.ok()) {
@@ -333,10 +336,11 @@ const char* const DebugIO::kMemoryURLScheme = "memcbk://";
 
 // Publishes debug metadata to a set of debug URLs.
 absl::Status DebugIO::PublishDebugMetadata(
     const int64_t global_step, const int64_t session_run_index,
-    const int64_t executor_step_index, const std::vector<string>& input_names,
-    const std::vector<string>& output_names,
-    const std::vector<string>& target_nodes,
-    const std::unordered_set<string>& debug_urls) {
+    const int64_t executor_step_index,
+    const std::vector<std::string>& input_names,
+    const std::vector<std::string>& output_names,
+    const std::vector<std::string>& target_nodes,
+    const std::unordered_set<std::string>& debug_urls) {
   std::ostringstream oss;
 
   // Construct a JSON string to carry the metadata.
@@ -370,24 +374,24 @@ absl::Status DebugIO::PublishDebugMetadata(
   oss << "]";
   oss << "}";
 
-  const string json_metadata = oss.str();
+  const std::string json_metadata = oss.str();
   Event event;
   event.set_wall_time(static_cast<double>(Env::Default()->NowMicros()));
   LogMessage* log_message = event.mutable_log_message();
   log_message->set_message(json_metadata);
 
   absl::Status status;
-  for (const string& url : debug_urls) {
+  for (const std::string& url : debug_urls) {
     if (absl::StartsWith(absl::AsciiStrToLower(url), kGrpcURLScheme)) {
 #ifndef PLATFORM_WINDOWS
       Event grpc_event;
 
       // Determine the path (if any) in the grpc:// URL, and add it as a field
       // of the JSON string.
-      const string address = url.substr(strlen(DebugIO::kFileURLScheme));
-      const string path = address.find('/') == string::npos
-                              ? ""
-                              : address.substr(address.find('/'));
+      const std::string address = url.substr(strlen(DebugIO::kFileURLScheme));
+      const std::string path = address.find('/') == std::string::npos
+                                   ? ""
+                                   : address.substr(address.find('/'));
       grpc_event.set_wall_time(event.wall_time());
       LogMessage* log_message_grpc = grpc_event.mutable_log_message();
       log_message_grpc->set_message(
@@ -400,8 +404,8 @@ absl::Status DebugIO::PublishDebugMetadata(
       GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
#endif
     } else if (absl::StartsWith(absl::AsciiStrToLower(url), kFileURLScheme)) {
-      const string dump_root_dir = url.substr(strlen(kFileURLScheme));
-      const string core_metadata_path = AppendTimestampToFilePath(
+      const std::string dump_root_dir = url.substr(strlen(kFileURLScheme));
+      const std::string core_metadata_path = AppendTimestampToFilePath(
           io::JoinPath(
               dump_root_dir,
              absl::StrCat(DebugNodeKey::kMetadataFilePrefix,
@@ -410,8 +414,8 @@ absl::Status DebugIO::PublishDebugMetadata(
                                         session_run_index)))),
          Env::Default()->NowMicros());
      status.Update(DebugFileIO::DumpEventProtoToFile(
-          event, string(io::Dirname(core_metadata_path)),
-          string(io::Basename(core_metadata_path))));
+          event, std::string(io::Dirname(core_metadata_path)),
+          std::string(io::Basename(core_metadata_path))));
     }
   }
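
PublishDebugMetadata, like the tensor and graph publishers below, dispatches on the URL scheme prefix and treats the remainder of a file:// URL as the dump root. A minimal sketch of that prefix handling; the URL is illustrative:

    #include <iostream>
    #include <string>
    #include "absl/strings/match.h"

    int main() {
      const std::string url = "file:///tmp/tfdbg_dumps";  // hypothetical URL
      const std::string kFileURLScheme = "file://";
      if (absl::StartsWith(url, kFileURLScheme)) {
        // Everything after the scheme is the dump root directory.
        std::cout << url.substr(kFileURLScheme.size()) << "\n";
        // -> /tmp/tfdbg_dumps
      }
    }
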
@@ -420,13 +424,13 @@ absl::Status DebugIO::PublishDebugMetadata(
 
 absl::Status DebugIO::PublishDebugTensor(
     const DebugNodeKey& debug_node_key, const Tensor& tensor,
-    const uint64 wall_time_us, const absl::Span<const string> debug_urls,
+    const uint64_t wall_time_us, const absl::Span<const std::string> debug_urls,
     const bool gated_grpc, const int64_t step_id) {
   int32_t num_failed_urls = 0;
   std::vector<absl::Status> fail_statuses;
-  for (const string& url : debug_urls) {
+  for (const std::string& url : debug_urls) {
     if (absl::StartsWith(absl::AsciiStrToLower(url), kFileURLScheme)) {
-      const string dump_root_dir = url.substr(strlen(kFileURLScheme));
+      const std::string dump_root_dir = url.substr(strlen(kFileURLScheme));
 
       const int64_t tensorBytes =
           tensor.IsInitialized() ? tensor.TotalBytes() : 0;
@@ -465,7 +469,7 @@ absl::Status DebugIO::PublishDebugTensor(
       GRPC_OSS_WINDOWS_UNIMPLEMENTED_ERROR;
 #endif
     } else if (absl::StartsWith(absl::AsciiStrToLower(url), kMemoryURLScheme)) {
-      const string dump_root_dir = url.substr(strlen(kMemoryURLScheme));
+      const std::string dump_root_dir = url.substr(strlen(kMemoryURLScheme));
 
       auto* callback_registry = DebugCallbackRegistry::singleton();
       auto* callback = callback_registry->GetCallback(dump_root_dir);
       CHECK(callback) << "No callback registered for: " << dump_root_dir;
@@ -479,7 +483,7 @@ absl::Status DebugIO::PublishDebugTensor(
   if (num_failed_urls == 0) {
     return absl::OkStatus();
   } else {
-    string error_message = strings::StrCat(
+    std::string error_message = strings::StrCat(
         "Publishing to ", num_failed_urls, " of ", debug_urls.size(),
         " debug target URLs failed, due to the following errors:");
     for (absl::Status& status : fail_statuses) {
@@ -492,18 +496,19 @@ absl::Status DebugIO::PublishDebugTensor(
 
 absl::Status DebugIO::PublishDebugTensor(
     const DebugNodeKey& debug_node_key, const Tensor& tensor,
-    const uint64 wall_time_us, const absl::Span<const string> debug_urls) {
+    const uint64_t wall_time_us,
+    const absl::Span<const std::string> debug_urls) {
   return PublishDebugTensor(debug_node_key, tensor, wall_time_us, debug_urls,
                             false);
 }
 
 absl::Status DebugIO::PublishGraph(
-    const Graph& graph, const string& device_name,
-    const std::unordered_set<string>& debug_urls) {
+    const Graph& graph, const std::string& device_name,
+    const std::unordered_set<std::string>& debug_urls) {
   GraphDef graph_def;
   graph.ToGraphDef(&graph_def);
 
-  string buf;
+  std::string buf;
   graph_def.SerializeToString(&buf);
 
   const int64_t now_micros = Env::Default()->NowMicros();
@@ -512,13 +517,13 @@ absl::Status DebugIO::PublishGraph(
   event.set_graph_def(buf);
 
   absl::Status status = absl::OkStatus();
-  for (const string& debug_url : debug_urls) {
+  for (const std::string& debug_url : debug_urls) {
     if (absl::StartsWith(debug_url, kFileURLScheme)) {
-      const string dump_root_dir =
+      const std::string dump_root_dir =
           io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
                        DebugNodeKey::DeviceNameToDevicePath(device_name));
-      const uint64 graph_hash = ::tensorflow::Hash64(buf);
-      const string file_name =
+      const uint64_t graph_hash = ::tensorflow::Hash64(buf);
+      const std::string file_name =
           strings::StrCat(DebugNodeKey::kMetadataFilePrefix, DebugIO::kGraphTag,
                           DebugIO::kHashTag, graph_hash, "_", now_micros);
 
@@ -556,10 +561,10 @@ bool DebugIO::IsCopyNodeGateOpen(
 #endif
 }
 
-bool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
-                                  const std::vector<string>& debug_urls) {
+bool DebugIO::IsDebugNodeGateOpen(const std::string& watch_key,
+                                  const std::vector<std::string>& debug_urls) {
 #ifndef PLATFORM_WINDOWS
-  for (const string& debug_url : debug_urls) {
+  for (const std::string& debug_url : debug_urls) {
     if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme),
                           DebugIO::kGrpcURLScheme)) {
       return true;
@@ -575,8 +580,8 @@ bool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
 #endif
 }
 
-bool DebugIO::IsDebugURLGateOpen(const string& watch_key,
-                                 const string& debug_url) {
+bool DebugIO::IsDebugURLGateOpen(const std::string& watch_key,
+                                 const std::string& debug_url) {
 #ifndef PLATFORM_WINDOWS
   if (debug_url != kGrpcURLScheme) {
     return true;
@@ -588,7 +593,7 @@ bool DebugIO::IsDebugURLGateOpen(const string& watch_key,
 #endif
 }
 
-absl::Status DebugIO::CloseDebugURL(const string& debug_url) {
+absl::Status DebugIO::CloseDebugURL(const std::string& debug_url) {
   if (absl::StartsWith(debug_url, DebugIO::kGrpcURLScheme)) {
 #ifndef PLATFORM_WINDOWS
     return DebugGrpcIO::CloseGrpcStream(debug_url);
@@ -603,10 +608,10 @@ absl::Status DebugIO::CloseDebugURL(const string& debug_url) {
 
 absl::Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
                                           const Tensor& tensor,
-                                          const uint64 wall_time_us,
-                                          const string& dump_root_dir,
-                                          string* dump_file_path) {
-  const string file_path =
+                                          const uint64_t wall_time_us,
+                                          const std::string& dump_root_dir,
+                                          std::string* dump_file_path) {
+  const std::string file_path =
       GetDumpFilePath(dump_root_dir, debug_node_key, wall_time_us);
 
   if (dump_file_path != nullptr) {
@@ -618,9 +623,9 @@ absl::Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
 
 absl::Status DebugFileIO::DumpTensorToDirForNodeDumping(
     const DebugNodeKey& debug_node_key, const Tensor& tensor,
-    const uint64 wall_time_us, const string& dump_root_dir,
-    string* dump_file_path, const int64_t step_id) {
-  const string file_path = GetDumpFilePathForNodeDumping(
+    const uint64_t wall_time_us, const std::string& dump_root_dir,
+    std::string* dump_file_path, const int64_t step_id) {
+  const std::string file_path = GetDumpFilePathForNodeDumping(
       dump_root_dir, debug_node_key, wall_time_us, step_id);
   if (dump_file_path != nullptr) {
     *dump_file_path = file_path;
@@ -629,9 +634,9 @@ absl::Status DebugFileIO::DumpTensorToDirForNodeDumping(
   return DumpTensorToEventFile(debug_node_key, tensor, wall_time_us, file_path);
 }
 
-string DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
-                                    const DebugNodeKey& debug_node_key,
-                                    const uint64 wall_time_us) {
+std::string DebugFileIO::GetDumpFilePath(const std::string& dump_root_dir,
+                                         const DebugNodeKey& debug_node_key,
+                                         const uint64_t wall_time_us) {
   return AppendTimestampToFilePath(
       io::JoinPath(dump_root_dir, debug_node_key.device_path,
                    strings::StrCat(debug_node_key.node_name, "_",
@@ -640,9 +645,9 @@ string DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
       wall_time_us);
 }
 
-string DebugFileIO::GetDumpFilePathForNodeDumping(
-    const string& dump_root_dir, const DebugNodeKey& debug_node_key,
-    const uint64 wall_time_us, const int64_t step_id) {
+std::string DebugFileIO::GetDumpFilePathForNodeDumping(
+    const std::string& dump_root_dir, const DebugNodeKey& debug_node_key,
+    const uint64_t wall_time_us, const int64_t step_id) {
   return AppendTimestampToFilePath(
       io::JoinPath(
           dump_root_dir, kDumpSubDirName, absl::StrCat("step-", step_id),
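
Putting the two path helpers together: GetDumpFilePath joins the dump root, the device path, and a node/slot/debug-op stem, then AppendTimestampToFilePath (above) suffixes the wall time, plus a "-<i>" counter on collision. With illustrative values, a DebugIdentity watch on node1:0 dumped at wall time 1700000000123456 under /tmp/dumps would produce a path shaped like /tmp/dumps/<device_path>/node1_0_DebugIdentity_1700000000123456.
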
@@ -654,8 +659,8 @@ string DebugFileIO::GetDumpFilePathForNodeDumping(
 }
 
 absl::Status DebugFileIO::DumpEventProtoToFile(const Event& event_proto,
-                                               const string& dir_name,
-                                               const string& file_name) {
+                                               const std::string& dir_name,
+                                               const std::string& file_name) {
   Env* env(Env::Default());
 
   absl::Status s = RecursiveCreateDir(env, dir_name);
@@ -665,9 +670,9 @@ absl::Status DebugFileIO::DumpEventProtoToFile(const Event& event_proto,
                      ", due to: ", s.message()));
   }
 
-  const string file_path = io::JoinPath(dir_name, file_name);
+  const std::string file_path = io::JoinPath(dir_name, file_name);
 
-  string event_str;
+  std::string event_str;
   event_proto.SerializeToString(&event_str);
 
   std::unique_ptr<WritableFile> f = nullptr;
@@ -680,21 +685,21 @@ absl::Status DebugFileIO::DumpEventProtoToFile(const Event& event_proto,
 
 absl::Status DebugFileIO::DumpTensorToEventFile(
     const DebugNodeKey& debug_node_key, const Tensor& tensor,
-    const uint64 wall_time_us, const string& file_path) {
+    const uint64_t wall_time_us, const std::string& file_path) {
   std::vector<Event> events;
   TF_RETURN_IF_ERROR(
       WrapTensorAsEvents(debug_node_key, tensor, wall_time_us, 0, &events));
-  return DumpEventProtoToFile(events[0], string(io::Dirname(file_path)),
-                              string(io::Basename(file_path)));
+  return DumpEventProtoToFile(events[0], std::string(io::Dirname(file_path)),
+                              std::string(io::Basename(file_path)));
 }
 
-absl::Status DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
+absl::Status DebugFileIO::RecursiveCreateDir(Env* env, const std::string& dir) {
   if (env->FileExists(dir).ok() && env->IsDirectory(dir).ok()) {
     // The path already exists as a directory. Return OK right away.
     return absl::OkStatus();
   }
 
-  string parent_dir(io::Dirname(dir));
+  std::string parent_dir(io::Dirname(dir));
   if (!env->FileExists(parent_dir).ok()) {
     // The parent path does not exist yet, create it first.
     absl::Status s = RecursiveCreateDir(env, parent_dir);  // Recursive call
@@ -724,13 +729,13 @@ absl::Status DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
 }
 
 // Default total disk usage limit: 100 GBytes
-const uint64 DebugFileIO::kDefaultGlobalDiskBytesLimit = 107374182400L;
-uint64 DebugFileIO::global_disk_bytes_limit_ = 0;
-uint64 DebugFileIO::disk_bytes_used_ = 0;
+const uint64_t DebugFileIO::kDefaultGlobalDiskBytesLimit = 107374182400L;
+uint64_t DebugFileIO::global_disk_bytes_limit_ = 0;
+uint64_t DebugFileIO::disk_bytes_used_ = 0;
 
 mutex DebugFileIO::bytes_mu_(LINKER_INITIALIZED);
 
-bool DebugFileIO::requestDiskByteUsage(uint64 bytes) {
+bool DebugFileIO::requestDiskByteUsage(uint64_t bytes) {
   mutex_lock l(bytes_mu_);
   if (global_disk_bytes_limit_ == 0) {
     const char* env_tfdbg_disk_bytes_limit = getenv("TFDBG_DISK_BYTES_LIMIT");
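
requestDiskByteUsage lazily initializes the global limit from the TFDBG_DISK_BYTES_LIMIT environment variable, falling back to the 100-GByte default above, and then checks each request against the running total. A simplified sketch of that accounting, without the env-var lookup or the bytes_mu_ mutex:

    #include <cstdint>

    constexpr uint64_t kLimit = 107374182400ULL;  // 100 GBytes, as above.
    uint64_t bytes_used = 0;

    // Returns true and reserves the bytes if the dump fits in the budget.
    bool RequestBytes(uint64_t bytes) {
      if (bytes_used + bytes > kLimit) {
        return false;  // Over budget: the caller should skip this dump.
      }
      bytes_used += bytes;
      return true;
    }
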
@@ -760,13 +765,13 @@ void DebugFileIO::resetDiskByteUsage() {
 }
 
 #ifndef PLATFORM_WINDOWS
-DebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr)
+DebugGrpcChannel::DebugGrpcChannel(const std::string& server_stream_addr)
     : server_stream_addr_(server_stream_addr),
       url_(absl::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {}
 
 absl::Status DebugGrpcChannel::Connect(const int64_t timeout_micros) {
   ::grpc::ChannelArguments args;
-  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+              std::numeric_limits<int32_t>::max());
   // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
   args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
 
   channel_ = ::grpc::CreateCustomChannel(
@@ -801,9 +806,10 @@ void DebugGrpcChannel::ReceiveAndProcessEventReplies(const size_t max_replies) {
          ReadEventReply(&event_reply)) {
     for (const EventReply::DebugOpStateChange& debug_op_state_change :
          event_reply.debug_op_state_changes()) {
-      string watch_key = strings::StrCat(debug_op_state_change.node_name(), ":",
-                                         debug_op_state_change.output_slot(),
-                                         ":", debug_op_state_change.debug_op());
+      std::string watch_key =
+          strings::StrCat(debug_op_state_change.node_name(), ":",
+                          debug_op_state_change.output_slot(), ":",
+                          debug_op_state_change.debug_op());
       DebugGrpcIO::SetDebugNodeKeyGrpcState(url_, watch_key,
                                             debug_op_state_change.state());
     }
@@ -832,17 +838,17 @@ const size_t DebugGrpcIO::kGrpcMessageSizeLimitBytes = 4000 * 1024;
 
 const size_t DebugGrpcIO::kGrpcMaxVarintLengthSize = 6;
 
-std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
+std::unordered_map<std::string, std::unique_ptr<DebugGrpcChannel>>*
 DebugGrpcIO::GetStreamChannels() {
-  static std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
-      stream_channels =
-          new std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>();
+  static std::unordered_map<
+      std::string, std::unique_ptr<DebugGrpcChannel>>* stream_channels =
+      new std::unordered_map<std::string, std::unique_ptr<DebugGrpcChannel>>();
   return stream_channels;
 }
 
 absl::Status DebugGrpcIO::SendTensorThroughGrpcStream(
     const DebugNodeKey& debug_node_key, const Tensor& tensor,
-    const uint64 wall_time_us, const string& grpc_stream_url,
+    const uint64_t wall_time_us, const std::string& grpc_stream_url,
     const bool gated) {
   if (gated &&
       !IsReadGateOpen(grpc_stream_url, debug_node_key.debug_node_name)) {
@@ -868,7 +874,7 @@ absl::Status DebugGrpcIO::SendTensorThroughGrpcStream(
 }
 
 absl::Status DebugGrpcIO::ReceiveEventReplyProtoThroughGrpcStream(
-    EventReply* event_reply, const string& grpc_stream_url) {
+    EventReply* event_reply, const std::string& grpc_stream_url) {
   DebugGrpcChannel* debug_grpc_channel = nullptr;
   TF_RETURN_IF_ERROR(
       GetOrCreateDebugGrpcChannel(grpc_stream_url, &debug_grpc_channel));
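
GetStreamChannels uses a deliberately leaked function-local static map, a common pattern for process-lifetime registries because it sidesteps static destruction order problems at shutdown. The shape of the pattern, reduced to a sketch; the Channel type here is a stand-in for DebugGrpcChannel:

    #include <memory>
    #include <string>
    #include <unordered_map>

    struct Channel {};  // Stand-in for DebugGrpcChannel.

    std::unordered_map<std::string, std::unique_ptr<Channel>>* Cache() {
      // Never deleted: the map lives for the whole process, so lookups during
      // shutdown can never touch a destroyed container.
      static auto* cache =
          new std::unordered_map<std::string, std::unique_ptr<Channel>>();
      return cache;
    }
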
@@ -881,16 +887,16 @@ absl::Status DebugGrpcIO::ReceiveEventReplyProtoThroughGrpcStream(
 }
 
 absl::Status DebugGrpcIO::GetOrCreateDebugGrpcChannel(
-    const string& grpc_stream_url, DebugGrpcChannel** debug_grpc_channel) {
-  const string addr_with_path =
+    const std::string& grpc_stream_url, DebugGrpcChannel** debug_grpc_channel) {
+  const std::string addr_with_path =
       absl::StartsWith(grpc_stream_url, DebugIO::kGrpcURLScheme)
           ? grpc_stream_url.substr(strlen(DebugIO::kGrpcURLScheme))
           : grpc_stream_url;
-  const string server_stream_addr =
+  const std::string server_stream_addr =
       addr_with_path.substr(0, addr_with_path.find('/'));
   {
     mutex_lock l(streams_mu_);
-    std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
+    std::unordered_map<std::string, std::unique_ptr<DebugGrpcChannel>>*
         stream_channels = GetStreamChannels();
     if (stream_channels->find(grpc_stream_url) == stream_channels->end()) {
       std::unique_ptr<DebugGrpcChannel> channel(
@@ -905,7 +911,7 @@ absl::Status DebugGrpcIO::GetOrCreateDebugGrpcChannel(
 }
 
 absl::Status DebugGrpcIO::SendEventProtoThroughGrpcStream(
-    const Event& event_proto, const string& grpc_stream_url,
+    const Event& event_proto, const std::string& grpc_stream_url,
     const bool receive_reply) {
   DebugGrpcChannel* debug_grpc_channel;
   TF_RETURN_IF_ERROR(
@@ -924,15 +930,15 @@ absl::Status DebugGrpcIO::SendEventProtoThroughGrpcStream(
   return absl::OkStatus();
 }
 
-bool DebugGrpcIO::IsReadGateOpen(const string& grpc_debug_url,
-                                 const string& watch_key) {
+bool DebugGrpcIO::IsReadGateOpen(const std::string& grpc_debug_url,
+                                 const std::string& watch_key) {
   const DebugNodeName2State* enabled_node_to_state =
       GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
   return enabled_node_to_state->find(watch_key) != enabled_node_to_state->end();
 }
 
-bool DebugGrpcIO::IsWriteGateOpen(const string& grpc_debug_url,
-                                  const string& watch_key) {
+bool DebugGrpcIO::IsWriteGateOpen(const std::string& grpc_debug_url,
+                                  const std::string& watch_key) {
   const DebugNodeName2State* enabled_node_to_state =
       GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
   auto it = enabled_node_to_state->find(watch_key);
@@ -943,10 +949,10 @@ bool DebugGrpcIO::IsWriteGateOpen(const string& grpc_debug_url,
   }
 }
 
-absl::Status DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) {
+absl::Status DebugGrpcIO::CloseGrpcStream(const std::string& grpc_stream_url) {
   mutex_lock l(streams_mu_);
 
-  std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>*
+  std::unordered_map<std::string, std::unique_ptr<DebugGrpcChannel>>*
       stream_channels = GetStreamChannels();
   if (stream_channels->find(grpc_stream_url) != stream_channels->end()) {
     // Stream of the specified address exists. Close it and remove it from
@@ -961,18 +967,18 @@ absl::Status DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) {
   }
 }
 
-std::unordered_map<string, DebugGrpcIO::DebugNodeName2State>*
+std::unordered_map<std::string, DebugGrpcIO::DebugNodeName2State>*
 DebugGrpcIO::GetEnabledDebugOpStates() {
-  static std::unordered_map<string, DebugNodeName2State>*
+  static std::unordered_map<std::string, DebugNodeName2State>*
       enabled_debug_op_states =
-          new std::unordered_map<string, DebugNodeName2State>();
+          new std::unordered_map<std::string, DebugNodeName2State>();
   return enabled_debug_op_states;
 }
 
 DebugGrpcIO::DebugNodeName2State* DebugGrpcIO::GetEnabledDebugOpStatesAtUrl(
-    const string& grpc_debug_url) {
+    const std::string& grpc_debug_url) {
   static mutex* debug_ops_state_mu = new mutex();
-  std::unordered_map<string, DebugNodeName2State>* states =
+  std::unordered_map<std::string, DebugNodeName2State>* states =
       GetEnabledDebugOpStates();
 
   mutex_lock l(*debug_ops_state_mu);
@@ -984,7 +990,7 @@ DebugGrpcIO::DebugNodeName2State* DebugGrpcIO::GetEnabledDebugOpStatesAtUrl(
 }
 
 void DebugGrpcIO::SetDebugNodeKeyGrpcState(
-    const string& grpc_debug_url, const string& watch_key,
+    const std::string& grpc_debug_url, const std::string& watch_key,
     const EventReply::DebugOpStateChange::State new_state) {
   DebugNodeName2State* states = GetEnabledDebugOpStatesAtUrl(grpc_debug_url);
   if (new_state == EventReply::DebugOpStateChange::DISABLED) {
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index 95864c714682b6..99107971f0f2b4 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -36,15 +36,15 @@ limitations under the License.
namespace tensorflow { -absl::Status ReadEventFromFile(const string& dump_file_path, Event* event); +absl::Status ReadEventFromFile(const std::string& dump_file_path, Event* event); struct DebugWatchAndURLSpec { - DebugWatchAndURLSpec(const string& watch_key, const string& url, + DebugWatchAndURLSpec(const std::string& watch_key, const std::string& url, const bool gated_grpc) : watch_key(watch_key), url(url), gated_grpc(gated_grpc) {} - const string watch_key; - const string url; + const std::string watch_key; + const std::string url; const bool gated_grpc; }; @@ -63,10 +63,11 @@ class DebugIO { static absl::Status PublishDebugMetadata( const int64_t global_step, const int64_t session_run_index, - const int64_t executor_step_index, const std::vector<string>& input_names, - const std::vector<string>& output_names, - const std::vector<string>& target_nodes, - const std::unordered_set<string>& debug_urls); + const int64_t executor_step_index, + const std::vector<std::string>& input_names, + const std::vector<std::string>& output_names, + const std::vector<std::string>& target_nodes, + const std::unordered_set<std::string>& debug_urls); // Publishes a tensor to a debug target URL. // @@ -82,13 +83,15 @@ class DebugIO { // step_id: Step ID associated with the tensor. static absl::Status PublishDebugTensor( const DebugNodeKey& debug_node_key, const Tensor& tensor, - const uint64 wall_time_us, const absl::Span<const string> debug_urls, - bool gated_grpc, int64_t step_id = -1); + const uint64_t wall_time_us, + const absl::Span<const std::string> debug_urls, bool gated_grpc, + int64_t step_id = -1); // Convenience overload of the method above for no gated_grpc by default. static absl::Status PublishDebugTensor( const DebugNodeKey& debug_node_key, const Tensor& tensor, - const uint64 wall_time_us, const absl::Span<const string> debug_urls); + const uint64_t wall_time_us, + const absl::Span<const std::string> debug_urls); // Publishes a graph to a set of debug URLs. // @@ -96,8 +99,8 @@ class DebugIO { // graph: The graph to be published. // debug_urls: The set of debug URLs to publish the graph to. static absl::Status PublishGraph( - const Graph& graph, const string& device_name, - const std::unordered_set<string>& debug_urls); + const Graph& graph, const std::string& device_name, + const std::unordered_set<std::string>& debug_urls); // Determines whether a copy node needs to perform deep-copy of input tensor. // @@ -126,8 +129,8 @@ class DebugIO { // // Returns: // Whether this debug op should proceed. - static bool IsDebugNodeGateOpen(const string& watch_key, - const std::vector<string>& debug_urls); + static bool IsDebugNodeGateOpen(const std::string& watch_key, + const std::vector<std::string>& debug_urls); // Determines whether debug information should be sent through a grpc:// // debug URL given the current gRPC gating status. // @@ -141,10 +144,10 @@ // Returns: // Whether the sending of debug data to the debug_url should // proceed. - static bool IsDebugURLGateOpen(const string& watch_key, - const string& debug_url); + static bool IsDebugURLGateOpen(const std::string& watch_key, + const std::string& debug_url); - static absl::Status CloseDebugURL(const string& debug_url); + static absl::Status CloseDebugURL(const std::string& debug_url); }; // Helper class for debug ops. @@ -171,15 +174,15 @@ class DebugFileIO { // dump_file_path: The actual dump file path (passed as reference).
static absl::Status DumpTensorToDir(const DebugNodeKey& debug_node_key, const Tensor& tensor, - const uint64 wall_time_us, - const string& dump_root_dir, - string* dump_file_path); + const uint64_t wall_time_us, + const std::string& dump_root_dir, + std::string* dump_file_path); // Similar to the above, but for node inputs/outputs dumping feature. static absl::Status DumpTensorToDirForNodeDumping( const DebugNodeKey& debug_node_key, const Tensor& tensor, - uint64 wall_time_us, const string& dump_root_dir, string* dump_file_path, - int64_t step_id); + uint64_t wall_time_us, const std::string& dump_root_dir, + std::string* dump_file_path, int64_t step_id); // Get the full path to the dump file. // @@ -190,14 +193,14 @@ class DebugFileIO { // output_slot: Output slot index of the said node, e.g., 0. // debug_op: Name of the debug op, e.g., DebugIdentity. // wall_time_us: Time stamp of the dumped tensor, in microseconds (us). - static string GetDumpFilePath(const string& dump_root_dir, - const DebugNodeKey& debug_node_key, - const uint64 wall_time_us); + static std::string GetDumpFilePath(const std::string& dump_root_dir, + const DebugNodeKey& debug_node_key, + const uint64_t wall_time_us); // Similar to the above, but for node inputs/outputs dumping feature. - static string GetDumpFilePathForNodeDumping( - const string& dump_root_dir, const DebugNodeKey& debug_node_key, - uint64 wall_time_us, int64_t step_id); + static std::string GetDumpFilePathForNodeDumping( + const std::string& dump_root_dir, const DebugNodeKey& debug_node_key, + uint64_t wall_time_us, int64_t step_id); // Dumps an Event proto to a file. // @@ -206,8 +209,8 @@ class DebugFileIO { // dir_name: Directory path. // file_name: Base file name. static absl::Status DumpEventProtoToFile(const Event& event_proto, - const string& dir_name, - const string& file_name); + const std::string& dir_name, + const std::string& file_name); // Request additional bytes to be dumped to the file system. // @@ -222,31 +225,31 @@ class DebugFileIO { // Returns: // Whether the request is approved given the total dumping // limit. - static bool requestDiskByteUsage(uint64 bytes); + static bool requestDiskByteUsage(uint64_t bytes); // Reset the disk byte usage to zero. static void resetDiskByteUsage(); - static uint64 global_disk_bytes_limit_; + static uint64_t global_disk_bytes_limit_; private: // Encapsulates the Tensor in an Event protobuf and write it to file. static absl::Status DumpTensorToEventFile(const DebugNodeKey& debug_node_key, const Tensor& tensor, - const uint64 wall_time_us, - const string& file_path); + const uint64_t wall_time_us, + const std::string& file_path); // Implemented ad hoc here for now. // TODO(cais): Replace with shared implementation once http://b/30497715 is // fixed. - static absl::Status RecursiveCreateDir(Env* env, const string& dir); + static absl::Status RecursiveCreateDir(Env* env, const std::string& dir); // Tracks how much disk has been used so far. - static uint64 disk_bytes_used_; + static uint64_t disk_bytes_used_; // Mutex for thread-safe access to disk_bytes_used_. static mutex bytes_mu_; // Default limit for the disk space. - static const uint64 kDefaultGlobalDiskBytesLimit; + static const uint64_t kDefaultGlobalDiskBytesLimit; friend class DiskUsageLimitTest; }; @@ -282,7 +285,7 @@ class DebugGrpcChannel { // server_stream_addr: Address (host name and port) of the debug stream // server implementing the EventListener service (see // debug_service.proto). E.g., "127.0.0.1:12345". 
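Aside: the requestDiskByteUsage()/resetDiskByteUsage() pair above describes a global byte budget for tensor dumps: each write asks for an allowance and is rejected once the limit is hit. A minimal sketch of such a gate, assuming a fixed limit; this is a simplification of the static-member machinery shown above, not the actual implementation:

#include <cstdint>
#include <mutex>

class DiskBudget {
 public:
  explicit DiskBudget(uint64_t limit) : limit_(limit) {}

  // Approves the request only if it still fits in the remaining budget.
  bool Request(uint64_t bytes) {
    std::lock_guard<std::mutex> l(mu_);
    if (used_ + bytes > limit_) return false;
    used_ += bytes;
    return true;
  }

  // Resets the running total, mirroring resetDiskByteUsage().
  void Reset() {
    std::lock_guard<std::mutex> l(mu_);
    used_ = 0;
  }

 private:
  const uint64_t limit_;
  uint64_t used_ = 0;  // bytes dumped so far
  std::mutex mu_;
};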
- explicit DebugGrpcChannel(const string& server_stream_addr); + explicit DebugGrpcChannel(const std::string& server_stream_addr); virtual ~DebugGrpcChannel() {} @@ -337,8 +340,8 @@ class DebugGrpcChannel { absl::Status ReceiveServerRepliesAndClose(); private: - string server_stream_addr_; - string url_; + std::string server_stream_addr_; + std::string url_; ::grpc::ClientContext ctx_; std::shared_ptr<::grpc::Channel> channel_; std::unique_ptr<grpc::EventListener::Stub> stub_; @@ -356,7 +359,7 @@ class DebugGrpcIO { // Sends a tensor through a debug gRPC stream. static absl::Status SendTensorThroughGrpcStream( const DebugNodeKey& debug_node_key, const Tensor& tensor, - const uint64 wall_time_us, const string& grpc_stream_url, + const uint64_t wall_time_us, const std::string& grpc_stream_url, const bool gated); // Sends an Event proto through a debug gRPC stream. @@ -373,40 +376,40 @@ class DebugGrpcIO { // Returns: // The Status of the operation. static absl::Status SendEventProtoThroughGrpcStream( - const Event& event_proto, const string& grpc_stream_url, + const Event& event_proto, const std::string& grpc_stream_url, const bool receive_reply = false); // Receive an EventReply proto through a debug gRPC stream. static absl::Status ReceiveEventReplyProtoThroughGrpcStream( - EventReply* event_reply, const string& grpc_stream_url); + EventReply* event_reply, const std::string& grpc_stream_url); // Check whether a debug watch key is read-activated at a given gRPC URL. - static bool IsReadGateOpen(const string& grpc_debug_url, - const string& watch_key); + static bool IsReadGateOpen(const std::string& grpc_debug_url, + const std::string& watch_key); // Check whether a debug watch key is write-activated (i.e., read- and // write-activated) at a given gRPC URL. - static bool IsWriteGateOpen(const string& grpc_debug_url, - const string& watch_key); + static bool IsWriteGateOpen(const std::string& grpc_debug_url, + const std::string& watch_key); // Closes a gRPC stream to the given address, if it exists. // Thread-safety: Safe with respect to other calls to the same method and // calls to SendTensorThroughGrpcStream(). - static absl::Status CloseGrpcStream(const string& grpc_stream_url); + static absl::Status CloseGrpcStream(const std::string& grpc_stream_url); // Set the gRPC state of a debug node key. // TODO(cais): Include device information in watch_key. static void SetDebugNodeKeyGrpcState( - const string& grpc_debug_url, const string& watch_key, + const std::string& grpc_debug_url, const std::string& watch_key, const EventReply::DebugOpStateChange::State new_state); private: using DebugNodeName2State = - std::unordered_map<string, EventReply::DebugOpStateChange::State>; + std::unordered_map<std::string, EventReply::DebugOpStateChange::State>; // Returns a global map from grpc debug URLs to the corresponding // DebugGrpcChannels. - static std::unordered_map<string, std::unique_ptr<DebugGrpcChannel>>* + static std::unordered_map<std::string, std::unique_ptr<DebugGrpcChannel>>* GetStreamChannels(); // Get a DebugGrpcChannel object at a given URL, creating one if necessary. @@ -420,15 +423,16 @@ class DebugGrpcIO { // Returns: // Status of this operation. static absl::Status GetOrCreateDebugGrpcChannel( - const string& grpc_stream_url, DebugGrpcChannel** debug_grpc_channel); + const std::string& grpc_stream_url, + DebugGrpcChannel** debug_grpc_channel); // Returns a map from debug URL to a map from debug op name to enabled state. - static std::unordered_map<string, DebugNodeName2State>* + static std::unordered_map<std::string, DebugNodeName2State>* GetEnabledDebugOpStates(); // Returns a map from debug op names to enabled state, for a given debug URL.
static DebugNodeName2State* GetEnabledDebugOpStatesAtUrl( - const string& grpc_debug_url); + const std::string& grpc_debug_url); // Clear enabled debug op state from all debug URLs (if any). static void ClearEnabledWatchKeys(); diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc index d09465d80e5a01..fde63f53331cf1 100644 --- a/tensorflow/core/debug/debug_io_utils_test.cc +++ b/tensorflow/core/debug/debug_io_utils_test.cc @@ -107,7 +107,7 @@ TEST_F(DebugIOUtilsTest, DebugNodeKeysIsHashable) { TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) { Initialize(); - const string test_dir = + const std::string test_dir = absl::StrCat(testing::TmpDir(), "/DumpFloatTensorToFileSunnyDay"); if (!env_->FileExists(test_dir).ok()) { ASSERT_TRUE(env_->RecursivelyCreateDir(test_dir).ok()); @@ -115,11 +115,11 @@ TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) { // Append levels of nonexisting directories, to test that the function can // create directories. - const uint64 wall_time = env_->NowMicros(); + const uint64_t wall_time = env_->NowMicros(); const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "foo/bar/qux/tensor_a", 0, "DebugIdentity"); - string dump_file_path; + std::string dump_file_path; TF_ASSERT_OK(DebugFileIO::DumpTensorToDir( kDebugNodeKey, *tensor_a_, wall_time, test_dir, &dump_file_path)); @@ -154,16 +154,16 @@ TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) { TEST_F(DebugIOUtilsTest, DumpStringTensorToFileSunnyDay) { Initialize(); - const string test_dir = + const std::string test_dir = absl::StrCat(testing::TmpDir(), "/DumpStringTensorToFileSunnyDay"); if (!env_->FileExists(test_dir).ok()) { ASSERT_TRUE(env_->RecursivelyCreateDir(test_dir).ok()); } const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "quux/grault/tensor_b", 1, "DebugIdentity"); - const uint64 wall_time = env_->NowMicros(); + const uint64_t wall_time = env_->NowMicros(); - string dump_file_name; + std::string dump_file_name; absl::Status s = DebugFileIO::DumpTensorToDir( kDebugNodeKey, *tensor_b_, wall_time, test_dir, &dump_file_name); ASSERT_TRUE(s.ok()); @@ -209,17 +209,17 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) { Initialize(); // First, create the file at the path. - const string test_dir = + const std::string test_dir = absl::StrCat(testing::TmpDir(), "/DumpTensorToFileCannotCreateDirectory"); if (!env_->FileExists(test_dir).ok()) { ASSERT_TRUE(env_->RecursivelyCreateDir(test_dir).ok()); } - const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; + const std::string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0, "DebugIdentity"); - const string txt_file_dir = + const std::string txt_file_dir = io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName)); - const string txt_file_name = io::JoinPath(txt_file_dir, "baz"); + const std::string txt_file_name = io::JoinPath(txt_file_dir, "baz"); if (!env_->FileExists(txt_file_dir).ok()) { ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok()); } @@ -238,9 +238,9 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) { // Second, try to dump the tensor to a path that requires "baz" to be a // directory, which should lead to an error. 
- const uint64 wall_time = env_->NowMicros(); + const uint64_t wall_time = env_->NowMicros(); - string dump_file_name; + std::string dump_file_name; absl::Status s = DebugFileIO::DumpTensorToDir( kDebugNodeKey, *tensor_a_, wall_time, test_dir, &dump_file_name); ASSERT_FALSE(s.ok()); @@ -261,13 +261,13 @@ TEST_F(DebugIOUtilsTest, PublishTensorToMultipleFileURLs) { const int kNumDumpRoots = 3; const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "foo/bar/qux/tensor_a", 0, "DebugIdentity"); - const uint64 wall_time = env_->NowMicros(); + const uint64_t wall_time = env_->NowMicros(); - std::vector<string> dump_roots; - std::vector<string> dump_file_paths; - std::vector<string> urls; + std::vector<std::string> dump_roots; + std::vector<std::string> dump_file_paths; + std::vector<std::string> urls; for (int i = 0; i < kNumDumpRoots; ++i) { - string dump_root = + std::string dump_root = absl::StrCat(testing::TmpDir(), "/PublicTensorToMultipleFileUrls_", i); dump_roots.push_back(dump_root); @@ -331,10 +331,10 @@ TEST_F(DebugIOUtilsTest, PublishTensorToMemoryCallback) { const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", "foo/bar/qux/tensor_a", 0, "DebugIdentity"); - const uint64 wall_time = env_->NowMicros(); + const uint64_t wall_time = env_->NowMicros(); bool called = false; - std::vector<string> urls = {"memcbk://test_callback"}; + std::vector<std::string> urls = {"memcbk://test_callback"}; ; auto* callback_registry = DebugCallbackRegistry::singleton(); @@ -367,8 +367,8 @@ TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) { thread::ThreadPool* tp = new thread::ThreadPool(Env::Default(), "test", kConcurrentPubs); - const uint64 wall_time = env_->NowMicros(); - const string dump_root_base = + const uint64_t wall_time = env_->NowMicros(); + const std::string dump_root_base = absl::StrCat(testing::TmpDir(), "/PublishTensorConcurrentlyToPartiallyOverlappingPaths"); if (!env_->FileExists(dump_root_base).ok()) { @@ -376,8 +376,8 @@ } mutex mu; - std::vector<string> dump_roots TF_GUARDED_BY(mu); - std::vector<string> dump_file_paths TF_GUARDED_BY(mu); + std::vector<std::string> dump_roots TF_GUARDED_BY(mu); + std::vector<std::string> dump_file_paths TF_GUARDED_BY(mu); int dump_count TF_GUARDED_BY(mu) = 0; int done_count TF_GUARDED_BY(mu) = 0; @@ -387,8 +387,8 @@ &dump_file_paths, &wall_time, &kDebugNodeKey, &kConcurrentPubs, &all_done]() { // "gumpy" is the shared directory part of the path.
- string dump_root; - string debug_url; + std::string dump_root; + std::string debug_url; { mutex_lock l(mu); dump_root = @@ -401,7 +401,7 @@ TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) { debug_url = absl::StrCat("file://", dump_root); } - std::vector<string> urls; + std::vector<std::string> urls; urls.push_back(debug_url); absl::Status s = diff --git a/tensorflow/core/debug/debug_node_key.cc b/tensorflow/core/debug/debug_node_key.cc index 1fa51f138c2f6f..09510b8df1bfb8 100644 --- a/tensorflow/core/debug/debug_node_key.cc +++ b/tensorflow/core/debug/debug_node_key.cc @@ -26,9 +26,11 @@ const char* const DebugNodeKey::kMetadataFilePrefix = "_tfdbg_"; const char* const DebugNodeKey::kDeviceTag = "device_"; -DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name, - const int32_t output_slot, const string& debug_op, - const string& io_of_node, const bool is_input, +DebugNodeKey::DebugNodeKey(const std::string& device_name, + const std::string& node_name, + const int32_t output_slot, + const std::string& debug_op, + const std::string& io_of_node, const bool is_input, const int32_t io_index) : device_name(device_name), node_name(node_name), @@ -52,7 +54,8 @@ bool DebugNodeKey::operator!=(const DebugNodeKey& other) const { return !((*this) == other); } -const string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) { +const std::string DebugNodeKey::DeviceNameToDevicePath( + const std::string& device_name) { return absl::StrCat(kMetadataFilePrefix, kDeviceTag, str_util::StringReplace( str_util::StringReplace(device_name, ":", "_", true), diff --git a/tensorflow/core/debug/debug_node_key.h b/tensorflow/core/debug/debug_node_key.h index 5decb5cc683643..867e0809314324 100644 --- a/tensorflow/core/debug/debug_node_key.h +++ b/tensorflow/core/debug/debug_node_key.h @@ -27,28 +27,29 @@ struct DebugNodeKey { static const char* const kMetadataFilePrefix; static const char* const kDeviceTag; - DebugNodeKey(const string& device_name, const string& node_name, - int32_t output_slot, const string& debug_op, - const string& io_of_node = "", bool is_input = false, + DebugNodeKey(const std::string& device_name, const std::string& node_name, + int32_t output_slot, const std::string& debug_op, + const std::string& io_of_node = "", bool is_input = false, int32_t io_index = -1); // Converts a device name string to a device path string. // E.g., /job:localhost/replica:0/task:0/cpu:0 will be converted to // ,job_localhost,replica_0,task_0,cpu_0.
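Aside: the comment above fully specifies the transformation (prefix with the metadata/device tags, then ":" becomes "_" and "/" becomes ","). A sketch that reproduces it with plain std::string operations; ReplaceAll is a local helper written for this example, not a TensorFlow API:

#include <string>

// Replaces every occurrence of `from` in `s` with `to`.
std::string ReplaceAll(std::string s, const std::string& from,
                       const std::string& to) {
  for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos;
       pos += to.size()) {
    s.replace(pos, from.size(), to);
  }
  return s;
}

// "/job:localhost/replica:0/task:0/cpu:0"
//   -> "_tfdbg_device_,job_localhost,replica_0,task_0,cpu_0"
std::string DeviceNameToDevicePathSketch(const std::string& device_name) {
  return "_tfdbg_device_" +
         ReplaceAll(ReplaceAll(device_name, ":", "_"), "/", ",");
}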
- static const string DeviceNameToDevicePath(const string& device_name); + static const std::string DeviceNameToDevicePath( + const std::string& device_name); bool operator==(const DebugNodeKey& other) const; bool operator!=(const DebugNodeKey& other) const; - const string device_name; - const string node_name; - const int32 output_slot; - const string debug_op; - const string debug_node_name; - const string device_path; - const string io_of_node; + const std::string device_name; + const std::string node_name; + const int32_t output_slot; + const std::string debug_op; + const std::string debug_node_name; + const std::string device_path; + const std::string io_of_node; const bool is_input; - const int32 io_index; + const int32_t io_index; }; } // namespace tensorflow diff --git a/tensorflow/core/debug/debugger_state_impl.cc b/tensorflow/core/debug/debugger_state_impl.cc index a1545ad1aa1516..23b70b431d8dd0 100644 --- a/tensorflow/core/debug/debugger_state_impl.cc +++ b/tensorflow/core/debug/debugger_state_impl.cc @@ -23,7 +23,7 @@ namespace tensorflow { DebuggerState::DebuggerState(const DebugOptions& debug_options) { for (const DebugTensorWatch& watch : debug_options.debug_tensor_watch_opts()) { - for (const string& url : watch.debug_urls()) { + for (const std::string& url : watch.debug_urls()) { debug_urls_.insert(url); } } @@ -33,16 +33,17 @@ } DebuggerState::~DebuggerState() { - for (const string& debug_url : debug_urls_) { + for (const std::string& debug_url : debug_urls_) { DebugIO::CloseDebugURL(debug_url).IgnoreError(); } } absl::Status DebuggerState::PublishDebugMetadata( const int64_t global_step, const int64_t session_run_index, - const int64_t executor_step_index, const std::vector<string>& input_names, - const std::vector<string>& output_names, - const std::vector<string>& target_names) { + const int64_t executor_step_index, + const std::vector<std::string>& input_names, + const std::vector<std::string>& output_names, + const std::vector<std::string>& target_names) { return DebugIO::PublishDebugMetadata(global_step, session_run_index, executor_step_index, input_names, output_names, target_names, debug_urls_); @@ -55,11 +56,11 @@ absl::Status DebugGraphDecorator::DecorateGraph(Graph* graph, Device* device) { } absl::Status DebugGraphDecorator::PublishGraph(const Graph& graph, - const string& device_name) { - std::unordered_set<string> debug_urls; + const std::string& device_name) { + std::unordered_set<std::string> debug_urls; for (const DebugTensorWatch& watch : debug_options_.debug_tensor_watch_opts()) { - for (const string& url : watch.debug_urls()) { + for (const std::string& url : watch.debug_urls()) { debug_urls.insert(url); } } diff --git a/tensorflow/core/debug/debugger_state_impl.h b/tensorflow/core/debug/debugger_state_impl.h index c34aa8bb51a917..73e74738d59d3c 100644 --- a/tensorflow/core/debug/debugger_state_impl.h +++ b/tensorflow/core/debug/debugger_state_impl.h @@ -34,12 +34,13 @@ class DebuggerState : public DebuggerStateInterface { // details.
absl::Status PublishDebugMetadata( const int64_t global_step, const int64_t session_run_count, - const int64_t executor_step_count, const std::vector<string>& input_names, - const std::vector<string>& output_names, - const std::vector<string>& target_names) override; + const int64_t executor_step_count, + const std::vector<std::string>& input_names, + const std::vector<std::string>& output_names, + const std::vector<std::string>& target_names) override; private: - std::unordered_set<string> debug_urls_; + std::unordered_set<std::string> debug_urls_; }; class DebugGraphDecorator : public DebugGraphDecoratorInterface { @@ -50,7 +51,7 @@ class DebugGraphDecorator : public DebugGraphDecoratorInterface { absl::Status DecorateGraph(Graph* graph, Device* device) override; absl::Status PublishGraph(const Graph& graph, - const string& device_name) override; + const std::string& device_name) override; private: DebugOptions debug_options_; diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc index e5f5ef7620ab99..4e58928e5693dd 100644 --- a/tensorflow/core/debug/grpc_session_debug_test.cc +++ b/tensorflow/core/debug/grpc_session_debug_test.cc @@ -49,7 +49,7 @@ SessionOptions Devices(int num_cpus, int num_gpus) { return result; } -void CreateGraphDef(GraphDef* graph_def, string node_names[3]) { +void CreateGraphDef(GraphDef* graph_def, std::string node_names[3]) { Graph graph(OpRegistry::Global()); Tensor a_tensor(DT_FLOAT, TensorShape({1, 2})); @@ -77,7 +77,7 @@ void IsSingleFloatValue(const Tensor& val, float expected_val) { ASSERT_EQ(val.flat<float>()(0), expected_val); } -SessionOptions Options(const string& target, int placement_period) { +SessionOptions Options(const std::string& target, int placement_period) { SessionOptions options; // NOTE(mrry): GrpcSession requires a grpc:// scheme prefix in the target // string.
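Aside: since GrpcSession only accepts grpc://-prefixed targets (per the NOTE above), the test's Options() helper has to build one from a bare address. A sketch under that assumption, with a simplified stand-in for SessionOptions:

#include <string>

// Hypothetical stand-in for the SessionOptions fields the test touches.
struct OptionsSketch {
  std::string target;
  int placement_period = 0;
};

// Prepends the grpc:// scheme so GrpcSession accepts the target.
OptionsSketch MakeOptions(const std::string& addr, int placement_period) {
  OptionsSketch o;
  o.target = "grpc://" + addr;
  o.placement_period = placement_period;
  return o;
}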
@@ -115,18 +115,19 @@ class GrpcSessionDebugTest : public ::testing::Test { } } - const string GetDebugURL() { return debug_url_; } + const std::string GetDebugURL() { return debug_url_; } - void LoadTensorDumps(const string& subdir, std::vector<Tensor>* tensors) { - const string dirpath = io::JoinPath(dump_dir_, subdir); + void LoadTensorDumps(const std::string& subdir, + std::vector<Tensor>* tensors) { + const std::string dirpath = io::JoinPath(dump_dir_, subdir); if (!(Env::Default()->IsDirectory(dirpath).ok())) { return; } - std::vector<string> filenames; + std::vector<std::string> filenames; TF_ASSERT_OK(Env::Default()->GetChildren(dirpath, &filenames)); - for (const string& filename : filenames) { + for (const std::string& filename : filenames) { Event event; TF_ASSERT_OK(ReadEventFromFile(io::JoinPath(dirpath, filename), &event)); if (event.summary().value().size() == 1) { @@ -144,13 +145,13 @@ class GrpcSessionDebugTest : public ::testing::Test { debug_url_ = absl::StrCat("file://", dump_dir_); } - string dump_dir_; - string debug_url_; + std::string dump_dir_; + std::string debug_url_; }; TEST_F(GrpcSessionDebugTest, FileDebugURL) { GraphDef graph; - string node_names[3]; + std::string node_names[3]; CreateGraphDef(&graph, node_names); std::unique_ptr<test::TestCluster> cluster; @@ -216,7 +217,8 @@ TEST_F(GrpcSessionDebugTest, FileDebugURL) { TF_CHECK_OK(session->Close()); } -void SetDevice(GraphDef* graph, const string& name, const string& dev) { +void SetDevice(GraphDef* graph, const std::string& name, + const std::string& dev) { for (size_t i = 0; i < graph->node_size(); ++i) { if (graph->node(i).name() == name) { graph->mutable_node(i)->set_device(dev); diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc index 476ab423154c88..13d130d289418c 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.cc +++ b/tensorflow/core/distributed_runtime/graph_mgr.cc @@ -107,7 +107,7 @@ GraphMgr::Item::~Item() { // NOTE: node->device_name() is not set by GraphConstructor. We // expects that NodeDef in GraphDef given to workers fully specifies // device names. -static string SplitByDevice(const Node* node) { +static std::string SplitByDevice(const Node* node) { return node->assigned_device_name(); } @@ -144,7 +144,7 @@ absl::Status GraphMgr::DecorateAndPublishGraphForDebug( // // "executors" are filled with one executor per device if success and // the caller takes the ownership of returned executors. -absl::Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, +absl::Status GraphMgr::InitItem(const std::string& handle, const GraphDef& gdef, const GraphOptions& graph_options, const DebugOptions& debug_options, const ConfigProto& config_proto, @@ -187,14 +187,14 @@ absl::Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, gdef, &graph)); // Splits "graph" into multiple subgraphs by device names.
- std::unordered_map<string, GraphDef> partitions; + std::unordered_map<std::string, GraphDef> partitions; PartitionOptions popts; popts.node_to_loc = SplitByDevice; - popts.new_name = [this](const string& prefix) { + popts.new_name = [this](const std::string& prefix) { mutex_lock l(mu_); return absl::StrCat(prefix, "_G", next_id_++); }; - popts.get_incarnation = [this](const string& name) -> int64 { + popts.get_incarnation = [this](const std::string& name) -> int64_t { Device* device = nullptr; absl::Status s = device_mgr_->LookupDevice(name, &device); if (s.ok()) { @@ -211,7 +211,7 @@ absl::Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, TF_RETURN_IF_ERROR(AddControlEdges(popts, &partitions)); } - std::unordered_map<string, std::unique_ptr<Graph>> partition_graphs; + std::unordered_map<std::string, std::unique_ptr<Graph>> partition_graphs; for (auto& partition : partitions) { std::unique_ptr<Graph> device_graph(new Graph(OpRegistry::Global())); GraphConstructorOptions device_opts; @@ -236,7 +236,7 @@ absl::Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, const auto& optimizer_opts = graph_options.optimizer_options(); GraphOptimizer optimizer(optimizer_opts); for (auto& p : partition_graphs) { - const string& device_name = p.first; + const std::string& device_name = p.first; std::unique_ptr<Graph>& subgraph = p.second; item->units.resize(item->units.size() + 1); ExecutionUnit* unit = &(item->units.back()); @@ -316,14 +316,14 @@ absl::Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, return absl::OkStatus(); } -absl::Status GraphMgr::Register(const string& handle, const GraphDef& gdef, +absl::Status GraphMgr::Register(const std::string& handle, const GraphDef& gdef, const GraphOptions& graph_options, const DebugOptions& debug_options, const ConfigProto& config_proto, int64_t collective_graph_key, WorkerSession* session, DistributedFunctionLibraryRuntime* cluster_flr, - string* graph_handle) { + std::string* graph_handle) { Item* item = new Item; absl::Status s = InitItem(handle, gdef, graph_options, debug_options, config_proto, @@ -344,7 +344,7 @@ absl::Status GraphMgr::Register(const string& handle, const GraphDef& gdef, return absl::OkStatus(); } -absl::Status GraphMgr::Deregister(const string& handle) { +absl::Status GraphMgr::Deregister(const std::string& handle) { Item* item = nullptr; // Removes one item from table_.
{ @@ -380,7 +380,7 @@ absl::Status GraphMgr::DeregisterAll() { absl::Status GraphMgr::SendInputs(const int64_t step_id, const NamedTensors& in) { Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id).release(); - std::vector<string> keys; + std::vector<std::string> keys; std::vector<Tensor> tensors_to_send; keys.reserve(in.size()); tensors_to_send.reserve(in.size()); @@ -419,7 +419,7 @@ absl::Status GraphMgr::RecvOutputs(const int64_t step_id, NamedTensors* out) { void GraphMgr::RecvOutputsAsync(const int64_t step_id, NamedTensors* out, StatusCallback done) { Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id).release(); - std::vector<string> keys; + std::vector<std::string> keys; std::vector<Tensor>* received_keys = new std::vector<Tensor>; keys.reserve(out->size()); received_keys->reserve(out->size()); @@ -443,13 +443,13 @@ void GraphMgr::RecvOutputsAsync(const int64_t step_id, NamedTensors* out, } void GraphMgr::ExecuteAsync( - const string& handle, const int64_t step_id, const ExecutorOpts& opts, + const std::string& handle, const int64_t step_id, const ExecutorOpts& opts, const NamedTensors& in, WorkerSession* session, StepStatsCollector* collector, MutableRunGraphResponseWrapper* response, CancellationManager* cancellation_manager, tsl::CoordinationServiceAgent* coordination_service_agent, StatusCallback done) { - const uint64 start_time_usecs = Env::Default()->NowMicros(); + const uint64_t start_time_usecs = Env::Default()->NowMicros(); tsl::profiler::TraceMeProducer activity( // To TraceMeConsumers in ExecutorState::Process/Finish or RunGraphDone. [step_id] { @@ -498,7 +498,7 @@ void GraphMgr::ExecuteAsync( // Sends values specified by the caller. size_t input_size = 0; if (s.ok()) { - std::vector<string> keys; + std::vector<std::string> keys; std::vector<Tensor> tensors_to_send; keys.reserve(in.size()); tensors_to_send.reserve(in.size()); @@ -543,17 +543,19 @@ } void GraphMgr::StartParallelExecutors( - const string& handle, int64_t step_id, Item* item, Rendezvous* rendezvous, - CollectiveExecutor::Handle* ce_handle, StepStatsCollector* collector, - CostGraphDef* cost_graph, CancellationManager* cancellation_manager, - WorkerSession* session, int64_t start_time_usecs, + const std::string& handle, int64_t step_id, Item* item, + Rendezvous* rendezvous, CollectiveExecutor::Handle* ce_handle, + StepStatsCollector* collector, CostGraphDef* cost_graph, + CancellationManager* cancellation_manager, WorkerSession* session, + int64_t start_time_usecs, tsl::CoordinationServiceAgent* coordination_service_agent, StatusCallback done) { const int num_units = item->units.size(); CHECK_GE(num_units, 1); - ScopedStepContainer* step_container = new ScopedStepContainer( - step_id, - [this](const string& name) { device_mgr_->ClearContainers({name}); }); + ScopedStepContainer* step_container = + new ScopedStepContainer(step_id, [this](const std::string& name) { + device_mgr_->ClearContainers({name}); + }); // NOTE: Transfer one ref of rendezvous and item.
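Aside: the ScopedStepContainer constructed above pairs a per-step resource-container name with a cleanup callback that is run when the step finishes. A rough sketch of that shape, under the assumption that the name is derived from the step id; the real class lives in core/framework/resource_mgr.h and does more:

#include <cstdint>
#include <functional>
#include <string>
#include <utility>

class StepContainerSketch {
 public:
  StepContainerSketch(int64_t step_id,
                      std::function<void(const std::string&)> cleanup)
      : name_("__per_step_" + std::to_string(step_id)),
        cleanup_(std::move(cleanup)) {}

  // On destruction the callback clears the container, mirroring the
  // device_mgr_->ClearContainers({name}) lambda in the diff above.
  ~StepContainerSketch() { cleanup_(name_); }

 private:
  const std::string name_;
  std::function<void(const std::string&)> cleanup_;
};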
ExecutorBarrier* barrier = new ExecutorBarrier(num_units, rendezvous, @@ -602,7 +604,7 @@ void GraphMgr::BuildCostModel(Item* item, StepStatsCollector* collector, CostGraphDef* cost_graph) { if (collector && !skip_cost_models_) { // Build the cost model - std::unordered_map<string, Graph*> device_to_graph; + std::unordered_map<std::string, Graph*> device_to_graph; for (const auto& unit : item->units) { if (unit.build_cost_model > 0) { device_to_graph[unit.device->name()] = unit.graph.get(); diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h index 5c8c7ce0f20c95..3458771a21e9b1 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.h +++ b/tensorflow/core/distributed_runtime/graph_mgr.h @@ -85,21 +85,21 @@ class GraphMgr { // Registers a graph. Fills in "handle". The registered graph retains a // reference to cluster_flr to do cross process function calls. - absl::Status Register(const string& handle, const GraphDef& gdef, + absl::Status Register(const std::string& handle, const GraphDef& gdef, const GraphOptions& graph_options, const DebugOptions& debug_options, const ConfigProto& config_proto, int64_t collective_graph_key, WorkerSession* session, DistributedFunctionLibraryRuntime* cluster_flr, - string* graph_handle); + std::string* graph_handle); // Executes one step of a registered graph "handle". // // If "out" is not nullptr, "out" specifies all keys the execution // should receive upon finish. - typedef std::map<string, Tensor> NamedTensors; + typedef std::map<std::string, Tensor> NamedTensors; typedef std::function<void(const absl::Status&)> StatusCallback; - void ExecuteAsync(const string& handle, const int64_t step_id, + void ExecuteAsync(const std::string& handle, const int64_t step_id, const ExecutorOpts& opts, const NamedTensors& in, WorkerSession* session, StepStatsCollector* collector, MutableRunGraphResponseWrapper* response, @@ -113,7 +113,7 @@ class GraphMgr { StatusCallback done); // Deregisters a graph. - absl::Status Deregister(const string& handle); + absl::Status Deregister(const std::string& handle); // Deregister all graphs. absl::Status DeregisterAll(); @@ -137,10 +137,10 @@ class GraphMgr { ~Item() override; // Session handle. - string session; + std::string session; // Graph handle. - string handle; + std::string handle; // Session configuration options for the graph. ConfigProto session_config; @@ -177,13 +177,14 @@ // TODO(zhifengc): If the client does not call Deregister, we'll // lose memory over time. We should implement a timeout-based // mechanism to gc these graphs.
- std::unordered_map<string, Item*> table_; + std::unordered_map<std::string, Item*> table_; void StartParallelExecutors( - const string& handle, int64_t step_id, Item* item, Rendezvous* rendezvous, - CollectiveExecutor::Handle* ce_handle, StepStatsCollector* collector, - CostGraphDef* cost_graph, CancellationManager* cancellation_manager, - WorkerSession* session, int64_t start_time_usecs, + const std::string& handle, int64_t step_id, Item* item, + Rendezvous* rendezvous, CollectiveExecutor::Handle* ce_handle, + StepStatsCollector* collector, CostGraphDef* cost_graph, + CancellationManager* cancellation_manager, WorkerSession* session, + int64_t start_time_usecs, tsl::CoordinationServiceAgent* coordination_service_agent, StatusCallback done); @@ -194,7 +195,7 @@ class GraphMgr { void BuildCostModel(Item* item, StepStatsCollector* collector, CostGraphDef* cost_graph); - absl::Status InitItem(const string& handle, const GraphDef& gdef, + absl::Status InitItem(const std::string& handle, const GraphDef& gdef, const GraphOptions& graph_options, const DebugOptions& debug_options, const ConfigProto& config_proto, diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc index 95aed8f498efc8..5935465711a02e 100644 --- a/tensorflow/core/distributed_runtime/scheduler.cc +++ b/tensorflow/core/distributed_runtime/scheduler.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/scheduler.h" +#include <limits> #include <queue> #include "tensorflow/core/common_runtime/device.h" @@ -280,7 +281,7 @@ Microseconds GreedyScheduler::ComputeSchedule( const Node* GreedyScheduler::GetNodeWithHighestPriority( const std::vector<const Node*>& nodes) { const Node* curr_node = nullptr; - int64_t curr_priority = kint64max; + int64_t curr_priority = std::numeric_limits<int64_t>::max(); for (const Node* n : nodes) { if ((*priority_)[n->id()] < curr_priority) { curr_node = n; diff --git a/tensorflow/core/example/example_parser_configuration.cc b/tensorflow/core/example/example_parser_configuration.cc index 085d215656c978..7f3cbcd2a49936 100644 --- a/tensorflow/core/example/example_parser_configuration.cc +++ b/tensorflow/core/example/example_parser_configuration.cc @@ -30,7 +30,7 @@ limitations under the License. namespace tensorflow { absl::Status FindNodeIndexByName(const tensorflow::GraphDef& graph, - const string& node_name, int* node_idx) { + const std::string& node_name, int* node_idx) { for (int i = 0; i < graph.node_size(); ++i) { const auto& node = graph.node(i); if (node.name() == node_name) { @@ -42,7 +42,7 @@ absl::Status FindNodeIndexByName(const tensorflow::GraphDef& graph, } absl::Status ExtractExampleParserConfiguration( - const tensorflow::GraphDef& graph, const string& node_name, + const tensorflow::GraphDef& graph, const std::string& node_name, tensorflow::Session* session, std::vector<FixedLenFeature>* fixed_len_features, std::vector<VarLenFeature>* var_len_features) { @@ -95,7 +95,7 @@ absl::Status ExtractExampleParserConfiguration( // We must fetch the configuration input tensors to the ParseExample op. // Skipping index = 0, which is the serialized proto input.
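Aside: the scheduler hunk above replaces the kint64max macro with the standard-library sentinel; the surrounding min-search idiom is unchanged. For illustration, the same idiom in isolation:

#include <cstdint>
#include <limits>
#include <vector>

// Returns the index of the smallest priority, or -1 for an empty list.
// Starting from numeric_limits<int64_t>::max() guarantees the first real
// element always wins the comparison.
int IndexOfLowestPriority(const std::vector<int64_t>& priorities) {
  int best = -1;
  int64_t best_priority = std::numeric_limits<int64_t>::max();
  for (int i = 0; i < static_cast<int>(priorities.size()); ++i) {
    if (priorities[i] < best_priority) {
      best = i;
      best_priority = priorities[i];
    }
  }
  return best;
}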
- std::vector<string> fetch_names(node.input_size() - 1); + std::vector<std::string> fetch_names(node.input_size() - 1); for (int i = 1; i < node.input_size(); ++i) { fetch_names[i - 1] = node.input(i); } @@ -134,7 +134,7 @@ absl::Status ExtractExampleParserConfiguration( int sparse_shapes_output_start = sparse_values_output_start + num_sparse; int dense_values_output_start = sparse_shapes_output_start + num_sparse; - string node_output_prefix = absl::StrCat(node_name, ":"); + std::string node_output_prefix = absl::StrCat(node_name, ":"); for (int i = 0; i < num_sparse; ++i) { VarLenFeature& config = (*var_len_features)[i]; @@ -166,7 +166,7 @@ absl::Status ExampleParserConfigurationProtoToFeatureVectors( std::vector<VarLenFeature>* var_len_features) { const auto& feature_map = config_proto.feature_map(); for (auto it = feature_map.cbegin(); it != feature_map.cend(); ++it) { - string key = it->first; + std::string key = it->first; const auto& config = it->second; if (config.has_fixed_len_feature()) { const auto& fixed_config = config.fixed_len_feature(); diff --git a/tensorflow/core/example/example_parser_configuration.h b/tensorflow/core/example/example_parser_configuration.h index dd2aacaee2c078..b202b035da16c5 100644 --- a/tensorflow/core/example/example_parser_configuration.h +++ b/tensorflow/core/example/example_parser_configuration.h @@ -38,7 +38,7 @@ namespace tensorflow { // Given a graph and the node_name of a ParseExample op, // extract the FixedLenFeature/VarLenFeature configurations. absl::Status ExtractExampleParserConfiguration( - const tensorflow::GraphDef& graph, const string& node_name, + const tensorflow::GraphDef& graph, const std::string& node_name, tensorflow::Session* session, std::vector<FixedLenFeature>* fixed_len_features, std::vector<VarLenFeature>* var_len_features); diff --git a/tensorflow/core/example/example_parser_configuration_test.cc b/tensorflow/core/example/example_parser_configuration_test.cc index 8abbd705cbcbe7..d83984d3373139 100644 --- a/tensorflow/core/example/example_parser_configuration_test.cc +++ b/tensorflow/core/example/example_parser_configuration_test.cc @@ -29,7 +29,8 @@ limitations under the License.
namespace tensorflow { namespace { -void ReadFileToStringOrDie(Env* env, const string& filename, string* output) { +void ReadFileToStringOrDie(Env* env, const std::string& filename, + std::string* output) { TF_CHECK_OK(ReadFileToString(env, filename, output)); } @@ -42,8 +43,8 @@ std::unique_ptr<Session> CreateSession() { class ExtractExampleParserConfigurationTest : public ::testing::Test { protected: void SetUp() override { - string proto_string; - string filename = + std::string proto_string; + std::string filename = io::JoinPath(testing::TensorFlowSrcRoot(), "core/example/testdata/parse_example_graph_def.pbtxt"); ReadFileToStringOrDie(Env::Default(), filename, &proto_string); diff --git a/tensorflow/core/example/feature_util_test.cc b/tensorflow/core/example/feature_util_test.cc index 374bbe6093b717..8192c7b9ffa420 100644 --- a/tensorflow/core/example/feature_util_test.cc +++ b/tensorflow/core/example/feature_util_test.cc @@ -455,9 +455,9 @@ TEST(AppendFeatureValuesTest, StringValuesUsingInitializerList) { TEST(AppendFeatureValuesTest, StringVariablesUsingInitializerList) { Example example; - string string1("FOO"); - string string2("BAR"); - string string3("BAZ"); + std::string string1("FOO"); + std::string string2("BAR"); + std::string string3("BAZ"); AppendFeatureValues({string1, string2, string3}, "tag", &example); diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc index e7aa3a0bf21c17..d8d38eb58e9ae2 100644 --- a/tensorflow/core/framework/attr_value_util.cc +++ b/tensorflow/core/framework/attr_value_util.cc @@ -96,7 +96,7 @@ constexpr int kMaxTensorNestDepth = 100; // to serialize, compute hash based on TensorProto string representation. // This approach may result different hash codes with identical Tensors if they // are defined with different TensorProto representations. -uint64 TensorProtoHash(const TensorProto& tp) { +uint64_t TensorProtoHash(const TensorProto& tp) { Tensor tensor(tp.dtype()); bool success = tensor.FromProto(tp); if (success) { @@ -112,7 +112,7 @@ // string representation. Tensors with identical content potentially can have a // different hash code if they are defined with different TensorProto // representations.
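Aside: the comment above describes a size dispatch in the function that follows: tensors too large to deserialize cheaply are hashed from their serialized proto (fast, but representation-sensitive), small ones from materialized content (representation-agnostic). A sketch of that dispatch; the 32mb threshold matches the warning in attr_value_util.h, and both hash callbacks are illustrative stand-ins:

#include <cstdint>
#include <functional>

uint64_t FastHashSketch(int64_t tensor_byte_size,
                        const std::function<uint64_t()>& content_hash,
                        const std::function<uint64_t()>& serialized_hash) {
  constexpr int64_t kMaxBytes = 32LL * 1024 * 1024;
  // Large tensors: hash the serialized proto; two equal tensors encoded
  // differently (e.g. tensor_content vs float_val) may hash differently.
  if (tensor_byte_size > kMaxBytes) return serialized_hash();
  // Small tensors: hash the materialized content.
  return content_hash();
}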
-uint64 FastTensorProtoHash(const TensorProto& tp) { +uint64_t FastTensorProtoHash(const TensorProto& tp) { if (attr_value_util_internal::TensorByteSize(tp) > kMaxAttrValueTensorByteSize) { return DeterministicProtoHash64(tp); @@ -180,15 +180,17 @@ bool AreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs, return AreSerializedProtosEqual(lhs_tp, rhs_tp); } -using TensorProtoHasher = std::function<uint64(const TensorProto&)>; +using TensorProtoHasher = std::function<uint64_t(const TensorProto&)>; -uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) { +uint64_t AttrValueHash(const AttrValue& a, + const TensorProtoHasher& tensor_hash) { if (a.has_tensor()) return tensor_hash(a.tensor()); if (a.has_func()) { const NameAttrList& func = a.func(); - uint64 h = Hash64(func.name()); - std::map<string, AttrValue> map(func.attr().begin(), func.attr().end()); + uint64_t h = Hash64(func.name()); + std::map<std::string, AttrValue> map(func.attr().begin(), + func.attr().end()); for (const auto& pair : map) { h = Hash64(pair.first.data(), pair.first.size(), h); h = Hash64Combine(AttrValueHash(pair.second, tensor_hash), h); @@ -200,8 +202,8 @@ uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) { return DeterministicProtoHash64(a); } -string SummarizeString(const string& str) { - string escaped = absl::CEscape(str); +std::string SummarizeString(const std::string& str) { + std::string escaped = absl::CEscape(str); // If the string is long, replace the middle with ellipses. constexpr int kMaxStringSummarySize = 80; @@ -216,7 +218,7 @@ string SummarizeString(const string& str) { } } -string SummarizeTensor(const TensorProto& tensor_proto) { +std::string SummarizeTensor(const TensorProto& tensor_proto) { Tensor t; int64_t tensor_byte_size = attr_value_util_internal::TensorByteSize(tensor_proto); @@ -233,8 +235,8 @@ return t.DebugString(); } -string SummarizeFunc(const NameAttrList& func) { - std::vector<string> entries; +std::string SummarizeFunc(const NameAttrList& func) { + std::vector<std::string> entries; for (const auto& p : func.attr()) { entries.push_back(absl::StrCat(p.first, "=", SummarizeAttrValue(p.second))); } @@ -242,7 +244,8 @@ return absl::StrCat(func.name(), "[", absl::StrJoin(entries, ", "), "]"); } -bool ParseAttrValueHelper_TensorNestsUnderLimit(int limit, string to_parse) { +bool ParseAttrValueHelper_TensorNestsUnderLimit(int limit, + std::string to_parse) { int nests = 0; int maxed_out = to_parse.length(); int open_curly = to_parse.find('{'); @@ -292,7 +295,7 @@ bool ParseAttrValueHelper_TensorNestsUnderLimit(int limit, string to_parse) { } // namespace -string SummarizeAttrValue(const AttrValue& attr_value) { +std::string SummarizeAttrValue(const AttrValue& attr_value) { switch (attr_value.value_case()) { case AttrValue::kS: return SummarizeString(attr_value.s()); @@ -309,7 +312,7 @@ case AttrValue::kTensor: return SummarizeTensor(attr_value.tensor()); case AttrValue::kList: { - std::vector<string> pieces; + std::vector<std::string> pieces; if (attr_value.list().s_size() > 0) { for (int i = 0; i < attr_value.list().s_size(); ++i) { pieces.push_back(SummarizeString(attr_value.list().s(i))); @@ -472,7 +475,7 @@ absl::Status AttrValueHasType(const AttrValue& attr_value, bool ParseAttrValue(absl::string_view type, absl::string_view text, AttrValue* out) { // Parse type.
- string field_name; + std::string field_name; bool is_list = absl::ConsumePrefix(&type, "list("); if (absl::ConsumePrefix(&type, "string")) { field_name = "s"; @@ -500,7 +503,7 @@ bool ParseAttrValue(absl::string_view type, absl::string_view text, } // Construct a valid text proto message to parse. - string to_parse; + std::string to_parse; if (is_list) { // TextFormat parser considers "i: 7" to be the same as "i: [7]", // but we only want to allow list values with []. @@ -550,8 +553,8 @@ void SetAttrValue(const AttrValue& value, AttrValue* out) { *out = value; } DEFINE_SET_ATTR_VALUE_ONE(ARG_TYPE, FIELD) \ DEFINE_SET_ATTR_VALUE_LIST(gtl::ArraySlice<ARG_TYPE>, FIELD) -DEFINE_SET_ATTR_VALUE_ONE(const string&, s) -DEFINE_SET_ATTR_VALUE_LIST(absl::Span<const string>, s) +DEFINE_SET_ATTR_VALUE_ONE(const std::string&, s) +DEFINE_SET_ATTR_VALUE_LIST(absl::Span<const std::string>, s) DEFINE_SET_ATTR_VALUE_BOTH(const char*, s) DEFINE_SET_ATTR_VALUE_BOTH(int64_t, i) DEFINE_SET_ATTR_VALUE_BOTH(int32_t, i) @@ -585,7 +588,7 @@ void SetAttrValue(const absl::Span<const NameAttrList> value, } } -void MoveAttrValue(std::vector<string>&& value, AttrValue* out) { +void MoveAttrValue(std::vector<std::string>&& value, AttrValue* out) { out->mutable_list()->Clear(); // Create list() even if value empty. for (auto& v : value) { out->mutable_list()->add_s(std::move(v)); @@ -689,8 +692,8 @@ bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b, const NameAttrList& af = a.func(); const NameAttrList& bf = b.func(); if (af.name() != bf.name()) return false; - std::unordered_map<string, AttrValue> am(af.attr().begin(), - af.attr().end()); + std::unordered_map<std::string, AttrValue> am(af.attr().begin(), + af.attr().end()); for (const auto& bm_pair : bf.attr()) { const auto& iter = am.find(bm_pair.first); if (iter == am.end()) return false; @@ -708,11 +711,11 @@ return AreSerializedProtosEqual(a, b); } -uint64 AttrValueHash(const AttrValue& a) { +uint64_t AttrValueHash(const AttrValue& a) { return AttrValueHash(a, TensorProtoHash); } -uint64 FastAttrValueHash(const AttrValue& a) { +uint64_t FastAttrValueHash(const AttrValue& a) { return AttrValueHash(a, FastTensorProtoHash); } diff --git a/tensorflow/core/framework/attr_value_util.h b/tensorflow/core/framework/attr_value_util.h index b6f7c972c71624..135bfe67231f37 100644 --- a/tensorflow/core/framework/attr_value_util.h +++ b/tensorflow/core/framework/attr_value_util.h @@ -76,12 +76,12 @@ void SetAttrValue(const Tensor& value, AttrValue* out); void SetAttrValue(const TensorProto& value, AttrValue* out); void SetAttrValue(const NameAttrList& value, AttrValue* out); -void SetAttrValue(absl::Span<const string> value, AttrValue* out); +void SetAttrValue(absl::Span<const std::string> value, AttrValue* out); void SetAttrValue(absl::Span<const tstring> value, AttrValue* out); void SetAttrValue(absl::Span<const char* const> value, AttrValue* out); void SetAttrValue(absl::Span<const StringPiece> value, AttrValue* out); void SetAttrValue(absl::Span<const int64_t> value, AttrValue* out); -void SetAttrValue(absl::Span<const int32> value, AttrValue* out); +void SetAttrValue(absl::Span<const int32_t> value, AttrValue* out); void SetAttrValue(absl::Span<const float> value, AttrValue* out); void SetAttrValue(absl::Span<const double> value, AttrValue* out); void SetAttrValue(absl::Span<const bool> value, AttrValue* out); @@ -97,7 +97,7 @@ void SetAttrValue(absl::Span<const NameAttrList> value, AttrValue* out); void SetAttrValue(const AttrValue& value, AttrValue* out); -void MoveAttrValue(std::vector<string>&& value, AttrValue* out); +void MoveAttrValue(std::vector<std::string>&& value, AttrValue* out); // Returns a hash of `a` that is consistent with AreAttrValuesEqual.
In other // words, if two AttrValues compare equal according to AreAttrValuesEqual, @@ -105,7 +105,7 @@ void MoveAttrValue(std::vector<string>&& value, AttrValue* out); // Similarly to protobuf deterministic serialization, hash value is // guaranteed to be stable only for a given binary. In particular, one should // probably not persist the returned value. -uint64 AttrValueHash(const AttrValue& a); +uint64_t AttrValueHash(const AttrValue& a); // WARNING: Equality check might return false-negative for large (> 32mb) // tensors defined with different TensorProto representations. @@ -117,7 +117,7 @@ uint64 AttrValueHash(const AttrValue& a); // bool_val), they will have different hash code and equals will return false. // Small (less than 32mb) tensors with different TensorProto representations // hashed/compared by their tensor content. -uint64 FastAttrValueHash(const AttrValue& a); +uint64_t FastAttrValueHash(const AttrValue& a); // Returns true if a and b have the same value. If false negatives are allowed, // then compares proto representation to avoid construction of large (> 32mb) // tensors. @@ -134,7 +134,7 @@ bool HasPlaceHolder(const AttrValue& val); // SubstituteFunc is given a placeholder string. If the placeholder is // unknown, SubstituteFunc returns false. Otherwise, overwrites the // attr value and returns true. -using SubstituteFunc = std::function<bool(const string&, AttrValue*)>; +using SubstituteFunc = std::function<bool(const std::string&, AttrValue*)>; bool SubstitutePlaceholders(const SubstituteFunc& substitute, AttrValue* value); } // namespace tensorflow diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc index 4e8daeb8f04dde..d6d685ef4c49f0 100644 --- a/tensorflow/core/framework/attr_value_util_test.cc +++ b/tensorflow/core/framework/attr_value_util_test.cc @@ -36,14 +36,14 @@ AttrValue V(T value) { return ret; } -AttrValue P(const string& p) { +AttrValue P(const std::string& p) { AttrValue ret; ret.set_placeholder(p); return ret; } -AttrValue F(const string& name, - std::vector<std::pair<string, AttrValue>> pairs) { +AttrValue F(const std::string& name, + std::vector<std::pair<std::string, AttrValue>> pairs) { AttrValue ret; ret.mutable_func()->set_name(name); ret.mutable_func()->mutable_attr()->insert(pairs.begin(), pairs.end()); @@ -51,7 +51,8 @@ AttrValue F(const string& name, } AttrValue Fs( - std::vector<std::pair<string, std::vector<std::pair<string, AttrValue>>>> funcs) { + std::vector< + std::pair<std::string, std::vector<std::pair<std::string, AttrValue>>>> funcs) { AttrValue ret; for (const auto& func : funcs) { @@ -82,7 +83,7 @@ TEST(AttrValueUtil, HasType) { } SubstituteFunc ReplaceTWith(const AttrValue& val) { - return [val](const string& placeholder, AttrValue* target) { + return [val](const std::string& placeholder, AttrValue* target) { if (placeholder == "T") { *target = val; return true; @@ -142,14 +143,14 @@ TEST(AttrValueUtil, DeepAttr) { TEST(AttrValueUtil, SummarizeAttrValueDoesNotElideShortStrings) { AttrValue attr_value; - SetAttrValue(string(40, '-'), &attr_value); - EXPECT_EQ(absl::StrCat("\"", string(40, '-'), "\""), + SetAttrValue(std::string(40, '-'), &attr_value); + EXPECT_EQ(absl::StrCat("\"", std::string(40, '-'), "\""), SummarizeAttrValue(attr_value)); } TEST(AttrValueUtil, SummarizeAttrValueElidesLongStrings) { AttrValue attr_value; - SetAttrValue(string(80, '-'), &attr_value); + SetAttrValue(std::string(80, '-'), &attr_value); EXPECT_EQ("\"----------...----------\"", SummarizeAttrValue(attr_value)); } @@ -197,7 +198,7 @@ TEST(AttrValueUtil, TensorByteSizeShouldNotOverflow) { } } -AttrValue FromText(const string& text) { +AttrValue FromText(const std::string& text) { AttrValue attr;
EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &attr)); return attr; } diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc index b2f4b856ec9feb..e4456d888df736 100644 --- a/tensorflow/core/framework/collective.cc +++ b/tensorflow/core/framework/collective.cc @@ -32,11 +32,11 @@ struct RegistrationInfo { // what is effectively a static instance of the collective implementation. // During param resolution of collective ops we return this static instance. // The actual op execution gets a fresh instance using `factory`. - RegistrationInfo(const string& n, CollectiveRegistry::Factory f) + RegistrationInfo(const std::string& n, CollectiveRegistry::Factory f) : name(n), factory(std::move(f)), param_resolver_instance(this->factory()) {} - string name; + std::string name; CollectiveRegistry::Factory factory; CollectiveImplementationInterface* param_resolver_instance; }; @@ -48,13 +48,13 @@ std::vector<RegistrationInfo>* MutableCollectiveRegistry() { } } // namespace -string CollGroupRuntimeDetails::ToString() const { +std::string CollGroupRuntimeDetails::ToString() const { return absl::StrCat("CollGroupRuntimeDetails {communicator_key=", absl::CEscape(communicator_key), "}"); } -string CollGroupParams::ToString() const { - string v = strings::StrCat( +std::string CollGroupParams::ToString() const { + std::string v = strings::StrCat( "CollGroupParams {group_key=", group_key, " group_size=", group_size, " device_type=", device_type.type_string(), " num_tasks=", num_tasks, " runtime_details=", runtime_details.ToString(), " devices {"); @@ -94,8 +94,8 @@ CollInstanceParams& CollInstanceParams::operator=( return *this; } -string CollInstanceParams::ToString() const { - string v = +std::string CollInstanceParams::ToString() const { + std::string v = strings::StrCat("CollInstanceParams { instance_key=", instance_key, " type=", type, " data_type=", DataTypeString(data_type), " shape=", shape.DebugString(), " devices {"); @@ -134,8 +134,9 @@ string CollInstanceParams::ToString() const { return v; } -string CollectiveParams::ToString() const { - string v = absl::StrCat("CollectiveParams ", name, " {", group.ToString()); +std::string CollectiveParams::ToString() const { + std::string v = + absl::StrCat("CollectiveParams ", name, " {", group.ToString()); absl::StrAppend(&v, " ", instance.ToString()); strings::StrAppend(&v, " default_rank=", default_rank, " is_source=", is_source, " source_rank=", source_rank, @@ -156,7 +157,7 @@ CollectiveContext::CollectiveContext( CollectiveExecutor* col_exec, NcclCommunicatorInterface* nccl_communicator, const DeviceMgr* dev_mgr, OpKernelContext* ctx, OpKernelContext::Params* op_params, const CollectiveParams* col_params, - const string& exec_key, int64_t step_id, const Tensor* input, + const std::string& exec_key, int64_t step_id, const Tensor* input, Tensor* output) : col_exec(col_exec), nccl_communicator(nccl_communicator), @@ -177,14 +178,14 @@ int64_t CollectiveExecutor::kInvalidId = -1; /*static*/ absl::Status CollectiveRegistry::Lookup( - const string& collective_name, + const std::string& collective_name, CollectiveImplementationInterface** implementation) { return LookupHelper(collective_name, implementation, false); } /*static*/ absl::Status CollectiveRegistry::LookupParamResolverInstance( - const string& collective_name, + const std::string& collective_name, CollectiveImplementationInterface** implementation) { return LookupHelper(collective_name, implementation, true); } @@ -198,7 +199,7 @@ void CollectiveRegistry::GetAll(
/*static*/ -absl::Status CollectiveRegistry::Register(const string& collective_name, +absl::Status CollectiveRegistry::Register(const std::string& collective_name, Factory factory) { std::vector<RegistrationInfo>* registry = MutableCollectiveRegistry(); for (const RegistrationInfo& reg_info : *registry) { @@ -212,7 +213,7 @@ absl::Status CollectiveRegistry::Register(const string& collective_name, /*static*/ absl::Status CollectiveRegistry::LookupHelper( - const string& collective_name, + const std::string& collective_name, CollectiveImplementationInterface** implementation, bool param_resolver) { std::vector<RegistrationInfo>* registry = MutableCollectiveRegistry(); for (const RegistrationInfo& reg_info : *registry) { diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h index 8fca00f0e3b515..cdb22129e813d4 100644 --- a/tensorflow/core/framework/collective.h +++ b/tensorflow/core/framework/collective.h @@ -56,16 +56,16 @@ enum CollectiveType { // the OpKernel. Currently, this struct is used to set communicator key for // NCCL-based collective implementation. struct CollGroupRuntimeDetails { - string communicator_key; // for communicator-based techniques e.g. NCCL - string ToString() const; + std::string communicator_key; // for communicator-based techniques e.g. NCCL + std::string ToString() const; }; struct CollGroupMember { DeviceAttributes device; - string task; + std::string task; bool is_local; // User provided rank - int32 rank = -1; + int32_t rank = -1; }; // Data common to all members of a device group. @@ -73,8 +73,8 @@ struct CollGroupMember { // particular to an instance so it is stored there. struct CollGroupParams { // Inputs from Collective ops: - int32 group_key; - int32 group_size; + int32_t group_key; + int32_t group_size; DeviceType device_type; int user_specified_rank = -1; // rank provided by the user. // Generated from Collective Group Resolver: @@ -83,10 +83,10 @@ struct CollGroupParams { // True if every task has the same number of devices. bool same_num_devices_per_task = false; // Task -> number of devices on that task. - std::unordered_map<string, int32> num_devices_per_task; - int32 num_tasks; // number of distinct tasks in group + std::unordered_map<std::string, int32_t> num_devices_per_task; + int32_t num_tasks; // number of distinct tasks in group CollGroupRuntimeDetails runtime_details; - string ToString() const; + std::string ToString() const; CollGroupParams() : group_key(0), group_size(0), device_type(DEVICE_CPU), num_tasks(0) {} }; @@ -99,7 +99,7 @@ struct CollGroupParams { // interpretation. On first execution the runtime will update this // structure with decisions that will guide all subsequent executions. struct CollImplDetails { - string collective_name; + std::string collective_name; std::vector<std::vector<int>> subdiv_permutations; // subdiv_offsets and max_subdivs_per_device are used together as follows: // When subdiv_offsets is provided (non-empty) it is used as is. When @@ -110,10 +110,10 @@ struct CollImplDetails { int max_subdivs_per_device = -1; // Upper bound on subdivisions per device. std::vector<int32> subdiv_offsets; std::vector<int32> subdiv_source_rank; // rank of source in each subdiv - std::vector<int32> - dependencies; // collective instances on which this node depends - string communication_hint; // user-supplied hint for implementation choice, - // e.g. ring or nccl + std::vector<int32> + dependencies; // collective instances on which this node depends + std::string communication_hint; // user-supplied hint for implementation // choice, e.g.
ring or nccl float timeout_seconds; // If non zero, set a completion timeout for the // collective op to detect staleness. }; @@ -122,16 +122,16 @@ struct CollImplDetails { // TODO(b/163171014) Refactor this struct to not be a union of all fields. struct CollInstanceParams { // Identifies all participating graph nodes. - int32 instance_key = -1; + int32_t instance_key = -1; // The full identifier includes both instance_key and step_id. int64_t step_id = 0; CollectiveType type = UNDEFINED_COLLECTIVE; DataType data_type = DT_FLOAT; TensorShape shape = {0}; CollImplDetails impl_details; - string ToString() const; + std::string ToString() const; CollInstanceParams& operator=(const struct CollInstanceParams& other); - std::vector devices; // permuter only + std::vector devices; // permuter only // For permuter only // Each rank in the permutation is a receiver. @@ -148,7 +148,7 @@ struct CollectiveParams : public core::RefCounted { CollGroupParams group; CollInstanceParams instance; - string name = ""; // node name used only for log or error messages + std::string name = ""; // node name used only for log or error messages int default_rank = -1; // index of this op within device_names bool is_source = false; // broadcast only int source_rank = -1; // broadcast only @@ -156,7 +156,7 @@ struct CollectiveParams : public core::RefCounted { std::vector subdiv_rank; OpKernel* merge_op = nullptr; // reduction only OpKernel* final_op = nullptr; // reduction only - string ToString() const; + std::string ToString() const; bool run_group_initialization = true; bool is_stateless = false; }; @@ -169,12 +169,12 @@ class DeviceResolverInterface { virtual ~DeviceResolverInterface() {} // Populates *attributes with the DeviceAttributes of the specified device. - virtual absl::Status GetDeviceAttributes(const string& device, + virtual absl::Status GetDeviceAttributes(const std::string& device, DeviceAttributes* attributes) = 0; // Returns all device attributes of a task. virtual absl::Status GetAllDeviceAttributes( - const string& task, std::vector* attributes) = 0; + const std::string& task, std::vector* attributes) = 0; // Updates device attributes. It returns error if any device already // exists in the DeviceResolver and has a different incarnation. 
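A note on the pattern running through the collective.h hunks above (and through the rest of this change): it swaps TensorFlow's legacy unqualified spellings for the standard ones. Assuming the legacy names are plain aliases, as declared in tensorflow/core/platform/types.h, the rewrite is purely textual with no behavioral or ABI effect. The snippet below is a self-contained illustration under that assumption; the `tf_sketch` namespace is a hypothetical mirror of those aliases, not the real header.

```cpp
// Self-contained illustration: if the legacy spellings are aliases, then
// rewriting `string` -> `std::string` and `int32` -> `int32_t` is a no-op.
#include <cstdint>
#include <string>
#include <type_traits>

namespace tf_sketch {
using string = std::string;    // assumed to match tensorflow::string
using int32 = std::int32_t;    // assumed to match tensorflow::int32
using uint64 = std::uint64_t;  // assumed to match tensorflow::uint64
}  // namespace tf_sketch

static_assert(std::is_same_v<tf_sketch::string, std::string>, "no-op rename");
static_assert(std::is_same_v<tf_sketch::int32, std::int32_t>, "no-op rename");
static_assert(std::is_same_v<tf_sketch::uint64, std::uint64_t>, "no-op rename");

int main() { return 0; }
```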
@@ -284,19 +284,17 @@ class CollectiveRemoteAccess { public: virtual ~CollectiveRemoteAccess() {} - virtual void RecvFromPeer(const string& peer_device, const string& peer_task, - bool peer_is_local, const string& key, - Device* to_device, DeviceContext* to_device_ctx, - const AllocatorAttributes& to_alloc_attr, - Tensor* to_tensor, - const DeviceLocality& client_locality, - int dev_to_dev_stream_index, - CancellationManager* cancellation_manager, - const StatusCallback& done) = 0; - - virtual void PostToPeer(const string& peer_device, const string& peer_task, - const string& key, Device* from_device, - DeviceContext* from_device_ctx, + virtual void RecvFromPeer( + const std::string& peer_device, const std::string& peer_task, + bool peer_is_local, const std::string& key, Device* to_device, + DeviceContext* to_device_ctx, const AllocatorAttributes& to_alloc_attr, + Tensor* to_tensor, const DeviceLocality& client_locality, + int dev_to_dev_stream_index, CancellationManager* cancellation_manager, + const StatusCallback& done) = 0; + + virtual void PostToPeer(const std::string& peer_device, + const std::string& peer_task, const std::string& key, + Device* from_device, DeviceContext* from_device_ctx, const AllocatorAttributes& from_alloc_attr, const Tensor* from_tensor, const DeviceLocality& client_locality, @@ -306,7 +304,8 @@ class CollectiveRemoteAccess { // Checks the health of a collective peer. It probes the peer to see if it is // alive. Note that if a peer has restarted, it's considered a different one, // so CheckPeerHealth fails. - virtual void CheckPeerHealth(const string& peer_task, int64_t timeout_in_ms, + virtual void CheckPeerHealth(const std::string& peer_task, + int64_t timeout_in_ms, const StatusCallback& done) = 0; virtual BufRendezvous* buf_rendezvous() = 0; @@ -322,7 +321,7 @@ class CollectiveExecutor : public core::RefCounted { virtual void ExecuteAsync(OpKernelContext* ctx, const CollectiveParams* col_params, - const string& exec_key, StatusCallback done) { + const std::string& exec_key, StatusCallback done) { done(errors::Internal( "A collective Op has been called in a context in which " "a CollectiveExecutor has not been provided.")); @@ -404,27 +403,28 @@ struct CollectiveContext { OpKernelContext* op_ctx; // Not owned OpKernelContext::Params* op_params; // Not owned core::IntrusivePtr col_params; - const string exec_key; + const std::string exec_key; const int64_t step_id; const Tensor* input; // Not owned Tensor* output; // Not owned Device* device; // The device for which this instance labors - const string device_name; + const std::string device_name; DeviceLocality device_locality; CollectiveContext(CollectiveExecutor* col_exec, NcclCommunicatorInterface* nccl_communicator, const DeviceMgr* dev_mgr, OpKernelContext* ctx, OpKernelContext::Params* op_params, - const CollectiveParams* col_params, const string& exec_key, - int64_t step_id, const Tensor* input, Tensor* output); + const CollectiveParams* col_params, + const std::string& exec_key, int64_t step_id, + const Tensor* input, Tensor* output); }; class NcclCommunicatorInterface { public: virtual ~NcclCommunicatorInterface() = default; - virtual string GenerateCommunicatorKey() = 0; + virtual std::string GenerateCommunicatorKey() = 0; virtual void Enqueue(std::shared_ptr col_ctx, StatusCallback done) = 0; @@ -474,7 +474,7 @@ class CollectiveRegistry { // `collective_name`. If found, creates an instance of the implementation and // assign to `implementation`. 
static absl::Status Lookup( - const string& collective_name, + const std::string& collective_name, CollectiveImplementationInterface** implementation); // Looks up a previously registered CollectiveImplementation under @@ -482,7 +482,7 @@ class CollectiveRegistry { // implementation via `implementation`. This instance should only be used to // call InitializateCollectiveParams. static absl::Status LookupParamResolverInstance( - const string& collective_name, + const std::string& collective_name, CollectiveImplementationInterface** implementation); // Returns all registered collective implementations. @@ -496,10 +496,11 @@ class CollectiveRegistry { // the CollectiveImplementation. Also creates a static instance of the // implementation - this instance is used during param resolution and should // only be used to call InitializeCollectiveParams. - static absl::Status Register(const string& collective_name, Factory factory); + static absl::Status Register(const std::string& collective_name, + Factory factory); static absl::Status LookupHelper( - const string& collective_name, + const std::string& collective_name, CollectiveImplementationInterface** implementation, bool param_resolver); }; @@ -507,7 +508,7 @@ class CollectiveRegistry { // create a global static object. class CollectiveRegistration { public: - CollectiveRegistration(const string& collective_name, + CollectiveRegistration(const std::string& collective_name, CollectiveRegistry::Factory factory) { TF_CHECK_OK(CollectiveRegistry::Register(collective_name, factory)); } diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 0f495b17a69544..bcfd94424e59c7 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -164,10 +164,10 @@ absl::Status EinsumShape(shape_inference::InferenceContext* c) { // We assume that the equation has a valid format. Either (x),(y)->(z) // or (x)->(z), where each of (x), (y) and (z) are concatenation of zero or // more latin alphabets and contains at most one ellipsis ('...'). - string equation; + std::string equation; TF_RETURN_IF_ERROR(c->GetAttr("equation", &equation)); - absl::InlinedVector input_labels; - string output_labels; + absl::InlinedVector input_labels; + std::string output_labels; TF_RETURN_IF_ERROR( ValidateEinsumEquation(equation, &input_labels, &output_labels)); @@ -391,7 +391,7 @@ absl::Status BiasAddShape(shape_inference::InferenceContext* c) { ShapeHandle input_shape; // Fetch the data_format attribute, which may not exist. - string data_format; + std::string data_format; absl::Status s = c->GetAttr("data_format", &data_format); if (s.ok() && data_format == "NCHW") { @@ -449,7 +449,7 @@ absl::Status BiasAddShape(shape_inference::InferenceContext* c) { absl::Status BiasAddGradShape(shape_inference::InferenceContext* c) { ShapeHandle input_shape; // Fetch the data_format attribute, which may not exist. - string data_format; + std::string data_format; absl::Status s = c->GetAttr("data_format", &data_format); if (s.ok() && data_format == "NCHW") { @@ -465,7 +465,7 @@ absl::Status BiasAddGradShape(shape_inference::InferenceContext* c) { absl::Status CheckFormatConstraintsOnShape( const TensorFormat tensor_format, const ShapeHandle shape_handle, - const string& tensor_name, shape_inference::InferenceContext* c) { + const std::string& tensor_name, shape_inference::InferenceContext* c) { if (tensor_format == FORMAT_NCHW_VECT_C) { // Check that the vect dim has size 4 or 32. 
const int num_dims = c->Rank(shape_handle); @@ -593,7 +593,7 @@ namespace { absl::Status Conv2DShapeImpl(shape_inference::InferenceContext* c, bool supports_explicit_padding) { - string data_format_str, filter_format_str; + std::string data_format_str, filter_format_str; if (!c->GetAttr("data_format", &data_format_str).ok()) { data_format_str = "NHWC"; } @@ -626,7 +626,7 @@ absl::Status Conv2DShapeImpl(shape_inference::InferenceContext* c, TF_RETURN_IF_ERROR( CheckFormatConstraintsOnShape(data_format, filter_shape, "filter", c)); - std::vector dilations; + std::vector dilations; TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations)); if (dilations.size() != 4) { @@ -635,7 +635,7 @@ absl::Status Conv2DShapeImpl(shape_inference::InferenceContext* c, dilations.size()); } - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); // strides.size() should be 4 (NCHW) even if the input is 5 (NCHW_VECT_C). @@ -808,7 +808,7 @@ absl::Status ConvShape(shape_inference::InferenceContext* c) { } // Default format is NHWC for 2D and NDHWC for 3D. - string data_format_str; + std::string data_format_str; TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); bool channels_last_format; if (data_format_str == "CHANNELS_LAST") { @@ -827,7 +827,7 @@ absl::Status ConvShape(shape_inference::InferenceContext* c) { // Determine number of spatial dims. int spatial_dims = standard_input_rank - 2; - std::vector dilations; + std::vector dilations; TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations)); // Default case. if (dilations.empty()) { @@ -840,7 +840,7 @@ absl::Status ConvShape(shape_inference::InferenceContext* c) { " values, but got: ", dilations.size())); } - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != standard_input_rank) { return absl::InvalidArgumentError( @@ -1004,10 +1004,10 @@ absl::Status Conv3DShape(shape_inference::InferenceContext* c) { ShapeHandle filter_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &filter_shape)); - string data_format; + std::string data_format; absl::Status s = c->GetAttr("data_format", &data_format); - std::vector dilations; + std::vector dilations; TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations)); if (dilations.size() != 5) { @@ -1016,7 +1016,7 @@ absl::Status Conv3DShape(shape_inference::InferenceContext* c) { dilations.size()); } - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != 5) { return errors::InvalidArgument( @@ -1113,7 +1113,7 @@ absl::Status Conv3DShape(shape_inference::InferenceContext* c) { } absl::Status Conv2DBackpropInputShape(shape_inference::InferenceContext* c) { - string data_format_str; + std::string data_format_str; if (!c->GetAttr("data_format", &data_format_str).ok()) { data_format_str = "NHWC"; } @@ -1188,7 +1188,7 @@ absl::Status Conv2DBackpropFilterWithBiasShape( shape_inference::InferenceContext* c) { ShapeHandle input_shape; // Fetch the data_format attribute, which may not exist. 
- string data_format; + std::string data_format; absl::Status s = c->GetAttr("data_format", &data_format); TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); @@ -1213,7 +1213,7 @@ absl::Status DepthwiseConv2DNativeShapeImpl( ShapeHandle filter_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape)); - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != 4) { @@ -1223,7 +1223,7 @@ absl::Status DepthwiseConv2DNativeShapeImpl( strides.size()); } - std::vector dilations; + std::vector dilations; if (!c->GetAttr("dilations", &dilations).ok()) { dilations.resize(4, 1); } @@ -1235,7 +1235,7 @@ absl::Status DepthwiseConv2DNativeShapeImpl( dilations.size()); } - string data_format_str; + std::string data_format_str; absl::Status s = c->GetAttr("data_format", &data_format_str); TensorFormat data_format; if (!s.ok() || !FormatFromString(data_format_str, &data_format)) { @@ -1338,7 +1338,7 @@ absl::Status DepthwiseConv2DNativeShapeWithExplicitPadding( } absl::Status AvgPoolShape(shape_inference::InferenceContext* c) { - string data_format_str; + std::string data_format_str; TensorFormat data_format; absl::Status s = c->GetAttr("data_format", &data_format_str); if (s.ok()) { @@ -1354,7 +1354,7 @@ absl::Status AvgPoolShape(shape_inference::InferenceContext* c) { TF_RETURN_IF_ERROR( CheckFormatConstraintsOnShape(data_format, input_shape, "input", c)); - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != 4) { return errors::InvalidArgument( @@ -1362,7 +1362,7 @@ absl::Status AvgPoolShape(shape_inference::InferenceContext* c) { strides.size()); } - std::vector kernel_sizes; + std::vector kernel_sizes; TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes)); if (kernel_sizes.size() != 4) { return errors::InvalidArgument( @@ -1415,7 +1415,7 @@ absl::Status AvgPoolGradShape(shape_inference::InferenceContext* c) { } absl::Status FusedBatchNormShape(shape_inference::InferenceContext* c) { - string data_format_str; + std::string data_format_str; TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); TensorFormat data_format; if (!FormatFromString(data_format_str, &data_format)) { @@ -1465,7 +1465,7 @@ absl::Status FusedBatchNormV3Shape(shape_inference::InferenceContext* c) { absl::Status FusedBatchNormExShape(shape_inference::InferenceContext* c) { TF_RETURN_IF_ERROR(FusedBatchNormV3Shape(c)); - string data_format_str; + std::string data_format_str; TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); TensorFormat data_format; if (!FormatFromString(data_format_str, &data_format)) { @@ -1488,7 +1488,7 @@ absl::Status FusedBatchNormExShape(shape_inference::InferenceContext* c) { } absl::Status FusedBatchNormGradShape(shape_inference::InferenceContext* c) { - string data_format_str; + std::string data_format_str; TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); TensorFormat data_format; if (!FormatFromString(data_format_str, &data_format)) { @@ -1537,7 +1537,7 @@ absl::Status FusedBatchNormGradExShape(shape_inference::InferenceContext* c) { return absl::OkStatus(); } - string data_format_str; + std::string data_format_str; TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); TensorFormat data_format; if (!FormatFromString(data_format_str, &data_format)) { @@ -1565,19 +1565,20 @@ absl::Status FusedBatchNormGradExShape(shape_inference::InferenceContext* c) { } absl::Status ReadDiagIndex(InferenceContext* c, 
const Tensor* diag_index_tensor, - int32* lower_diag_index, int32* upper_diag_index) { + int32_t* lower_diag_index, + int32_t* upper_diag_index) { // This function assumes that the shape of diag_index_tensor is fully defined. if (diag_index_tensor->dims() == 0) { - *lower_diag_index = diag_index_tensor->scalar()(); + *lower_diag_index = diag_index_tensor->scalar()(); *upper_diag_index = *lower_diag_index; } else { int32_t num_elements = diag_index_tensor->dim_size(0); if (num_elements == 1) { - *lower_diag_index = diag_index_tensor->vec()(0); + *lower_diag_index = diag_index_tensor->vec()(0); *upper_diag_index = *lower_diag_index; } else if (num_elements == 2) { - *lower_diag_index = diag_index_tensor->vec()(0); - *upper_diag_index = diag_index_tensor->vec()(1); + *lower_diag_index = diag_index_tensor->vec()(0); + *upper_diag_index = diag_index_tensor->vec()(1); } else { return errors::InvalidArgument( "diag_index must be a vector with one or two elements. It has ", @@ -1815,7 +1816,7 @@ absl::Status MatrixSetDiagV2Shape(shape_inference::InferenceContext* c) { absl::Status MaxPoolShapeImpl(shape_inference::InferenceContext* c, bool supports_explicit_padding) { - string data_format_str; + std::string data_format_str; TensorFormat data_format; absl::Status s = c->GetAttr("data_format", &data_format_str); if (s.ok()) { @@ -1831,7 +1832,7 @@ absl::Status MaxPoolShapeImpl(shape_inference::InferenceContext* c, TF_RETURN_IF_ERROR( CheckFormatConstraintsOnShape(data_format, input_shape, "input", c)); - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != 4) { return errors::InvalidArgument( @@ -1839,7 +1840,7 @@ absl::Status MaxPoolShapeImpl(shape_inference::InferenceContext* c, strides.size()); } - std::vector kernel_sizes; + std::vector kernel_sizes; TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes)); if (kernel_sizes.size() != 4) { return errors::InvalidArgument( @@ -1924,7 +1925,7 @@ absl::Status MaxPoolShapeWithExplicitPadding( absl::Status MaxPoolV2Shape(shape_inference::InferenceContext* c, int num_inputs) { - string data_format_str; + std::string data_format_str; TensorFormat data_format; absl::Status s = c->GetAttr("data_format", &data_format_str); if (s.ok()) { @@ -1940,8 +1941,8 @@ absl::Status MaxPoolV2Shape(shape_inference::InferenceContext* c, TF_RETURN_IF_ERROR( CheckFormatConstraintsOnShape(data_format, input_shape, "input", c)); - std::vector kernel_sizes; - std::vector strides; + std::vector kernel_sizes; + std::vector strides; if (c->num_inputs() + 2 == num_inputs) { TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes)); @@ -1962,7 +1963,7 @@ absl::Status MaxPoolV2Shape(shape_inference::InferenceContext* c, return absl::OkStatus(); } kernel_sizes.resize(kernel_sizes_tensor->shape().num_elements()); - auto kernel_sizes_vec = kernel_sizes_tensor->flat(); + auto kernel_sizes_vec = kernel_sizes_tensor->flat(); std::copy_n(&kernel_sizes_vec(0), kernel_sizes.size(), kernel_sizes.begin()); @@ -1972,7 +1973,7 @@ absl::Status MaxPoolV2Shape(shape_inference::InferenceContext* c, return absl::OkStatus(); } strides.resize(strides_tensor->shape().num_elements()); - auto strides_vec = strides_tensor->flat(); + auto strides_vec = strides_tensor->flat(); std::copy_n(&strides_vec(0), strides.size(), strides.begin()); } @@ -2029,10 +2030,10 @@ absl::Status Pool3DShape(shape_inference::InferenceContext* c) { ShapeHandle input_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 5, &input_shape)); - string data_format; + 
std::string data_format; absl::Status s = c->GetAttr("data_format", &data_format); - std::vector strides; + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != 5) { return errors::InvalidArgument( @@ -2041,7 +2042,7 @@ absl::Status Pool3DShape(shape_inference::InferenceContext* c) { strides.size()); } - std::vector kernel_sizes; + std::vector kernel_sizes; TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes)); if (kernel_sizes.size() != 5) { return errors::InvalidArgument( @@ -2181,8 +2182,8 @@ absl::Status ReductionShape(InferenceContext* c) { const int32_t input_rank = c->Rank(input); std::set true_indices; if (reduction_indices_t->dtype() == DataType::DT_INT32) { - TF_RETURN_IF_ERROR(ReductionShapeHelper(reduction_indices_t, - input_rank, &true_indices)); + TF_RETURN_IF_ERROR(ReductionShapeHelper( + reduction_indices_t, input_rank, &true_indices)); } else if (reduction_indices_t->dtype() == DataType::DT_INT64) { TF_RETURN_IF_ERROR(ReductionShapeHelper( reduction_indices_t, input_rank, &true_indices)); @@ -2247,13 +2248,13 @@ absl::Status ConcatShapeHelper(InferenceContext* c, int start_value_index, // shape. int64_t concat_dim; if (concat_dim_t->dtype() == DT_INT32) { - concat_dim = static_cast(concat_dim_t->flat()(0)); + concat_dim = static_cast(concat_dim_t->flat()(0)); } else { concat_dim = concat_dim_t->flat()(0); } // Minimum required number of dimensions. - const int64 min_rank = concat_dim < 0 ? -concat_dim : concat_dim + 1; + const int64_t min_rank = concat_dim < 0 ? -concat_dim : concat_dim + 1; ShapeHandle output_before; ShapeHandle output_after; @@ -2510,7 +2511,7 @@ absl::Status SliceShape(InferenceContext* c) { SliceHelper(c, begin_value, sizes_value, &dims)); } else { TF_RETURN_IF_ERROR( - SliceHelper(c, begin_value, sizes_value, &dims)); + SliceHelper(c, begin_value, sizes_value, &dims)); } c->set_output(0, c->MakeShape(dims)); return absl::OkStatus(); @@ -2749,7 +2750,7 @@ absl::Status SparseReduceShapeFn(InferenceContext* c) { const Tensor* axes_tensor = c->input_tensor(3); if (shape_tensor != nullptr && axes_tensor != nullptr) { auto shape_vec = shape_tensor->flat(); - auto axes_vec = axes_tensor->flat(); + auto axes_vec = axes_tensor->flat(); int64_t ndims = shape_vec.size(); absl::flat_hash_set axes; @@ -2797,7 +2798,7 @@ absl::Status QuantizedConv2DShape(InferenceContext* c) { } absl::Status FusedQuantizedConvShape(InferenceContext* c, int num_dims) { - std::vector fused_ops; + std::vector fused_ops; TF_RETURN_IF_ERROR(c->GetAttr("fused_ops", &fused_ops)); ShapeHandle unused, channel; bool fused_sum, fused_bias, fused_requantize; diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc index 9bc8a20208096f..a97915901cc027 100644 --- a/tensorflow/core/framework/common_shape_fns_test.cc +++ b/tensorflow/core/framework/common_shape_fns_test.cc @@ -220,7 +220,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { TEST(CommonShapeFnsTest, Einsum_ShapeFn) { ShapeInferenceTestOp op("Einsum"); - auto set_equation = [&op](int n, string equation) { + auto set_equation = [&op](int n, std::string equation) { std::vector input_list; input_list.reserve(n); for (int i = 0; i < n; ++i) { @@ -629,8 +629,9 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { TEST(CommonShapeFnsTest, ConvTest) { ShapeInferenceTestOp op("Conv"); - auto set_op = [&op](const std::vector& strides, const string& padding, - string data_format, int batch_dims, int groups) { + auto set_op = [&op](const 
std::vector& strides, + const std::string& padding, std::string data_format, + int batch_dims, int groups) { TF_CHECK_OK(NodeDefBuilder("test", op.name) .Input("input", 0, DT_FLOAT) .Input("filter", 0, DT_FLOAT) @@ -715,9 +716,11 @@ TEST(CommonShapeFnsTest, ConvTest) { TEST(CommonShapeFnsTest, Conv2DFormatsTest) { ShapeInferenceTestOp op("Conv2D"); - auto set_op = [&op](const std::vector& strides, const string& padding, - const string& data_format, const string& filter_format, - const std::vector& explicit_paddings = {}) { + auto set_op = [&op](const std::vector& strides, + const std::string& padding, + const std::string& data_format, + const std::string& filter_format, + const std::vector& explicit_paddings = {}) { TF_CHECK_OK(NodeDefBuilder("test", op.name) .Input("input", 0, DT_FLOAT) .Input("filter", 0, DT_FLOAT) @@ -761,15 +764,17 @@ TEST(CommonShapeFnsTest, Conv2DFormatsTest) { INFER_OK(op, "[1,1,4,4,32];[32,1,2,1,32]", "[d0_0,1,3,2,d0_4]"); } -class Conv2DShapeTest : public ::testing::TestWithParam {}; +class Conv2DShapeTest : public ::testing::TestWithParam {}; TEST_P(Conv2DShapeTest, Conv2DShapeTest) { - const string op_name = GetParam(); + const std::string op_name = GetParam(); ShapeInferenceTestOp op(op_name); - auto set_op = [&op](const std::vector& strides, const string& padding, - const string& data_format, const string& filter_format, - const std::vector& explicit_paddings = {}) { - string format; + auto set_op = [&op](const std::vector& strides, + const std::string& padding, + const std::string& data_format, + const std::string& filter_format, + const std::vector& explicit_paddings = {}) { + std::string format; if (op.name == "Conv") format = (data_format == "NHWC") ? "CHANNELS_LAST" : "CHANNELS_FIRST"; else @@ -974,13 +979,14 @@ TEST_P(Conv2DShapeTest, Conv2DShapeTest) { } TEST_P(Conv2DShapeTest, Conv2DDilatedShapeTest) { - const string op_name = GetParam(); + const std::string op_name = GetParam(); ShapeInferenceTestOp op(op_name); - auto set_op = [&op](const std::vector& dilations, - const std::vector& strides, const string& padding, - const string& data_format, - const std::vector& explicit_paddings = {}) { - string format; + auto set_op = [&op](const std::vector& dilations, + const std::vector& strides, + const std::string& padding, + const std::string& data_format, + const std::vector& explicit_paddings = {}) { + std::string format; if (op.name == "Conv") format = (data_format == "NHWC") ? 
"CHANNELS_LAST" : "CHANNELS_FIRST"; else @@ -1129,8 +1135,8 @@ TEST(CommonShapeFnsTest, Conv3DShapeRankTest) { TEST(CommonShapeFnsTest, Conv3DGroupsTest) { ShapeInferenceTestOp op("Conv3D"); - auto set_op = [&op](const std::vector& strides, - const string& padding) { + auto set_op = [&op](const std::vector& strides, + const std::string& padding) { TF_CHECK_OK(NodeDefBuilder("test", "Conv3D") .Input("input", 0, DT_FLOAT) .Input("filter", 0, DT_FLOAT) @@ -1166,13 +1172,13 @@ TEST(CommonShapeFnsTest, Conv3DGroupsTest) { INSTANTIATE_TEST_SUITE_P(CommonShapeFnsTest, Conv2DShapeTest, ::testing::Values("Conv2D", "Conv")); -class Conv3DShapeTest : public ::testing::TestWithParam {}; +class Conv3DShapeTest : public ::testing::TestWithParam {}; TEST_P(Conv3DShapeTest, Conv3DShapeTest) { - const string op_name = GetParam(); + const std::string op_name = GetParam(); ShapeInferenceTestOp op(op_name); - auto set_op = [&op](const std::vector& strides, - const string& padding) { + auto set_op = [&op](const std::vector& strides, + const std::string& padding) { TF_CHECK_OK(NodeDefBuilder("test", op.name) .Input("input", 0, DT_FLOAT) .Input("filter", 0, DT_FLOAT) @@ -1245,11 +1251,11 @@ TEST_P(Conv3DShapeTest, Conv3DShapeTest) { } TEST_P(Conv3DShapeTest, Conv3DDilatedShapeTest) { - const string op_name = GetParam(); + const std::string op_name = GetParam(); ShapeInferenceTestOp op(op_name); - auto set_op = [&op](const std::vector& dilations, - const std::vector& strides, - const string& padding) { + auto set_op = [&op](const std::vector& dilations, + const std::vector& strides, + const std::string& padding) { TF_CHECK_OK(NodeDefBuilder("test", op.name) .Input("input", 0, DT_FLOAT) .Input("filter", 0, DT_FLOAT) @@ -1300,7 +1306,7 @@ INSTANTIATE_TEST_SUITE_P(CommonShapeFnsTest, Conv3DShapeTest, TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) { ShapeInferenceTestOp op("DepthwiseConv2dNative"); - std::vector strides = {{1, 1, 1, 1}}; + std::vector strides = {{1, 1, 1, 1}}; TF_CHECK_OK(NodeDefBuilder("test", "DepthwiseConv2dNative") .Input("input", 0, DT_FLOAT) .Input("filter", 0, DT_FLOAT) @@ -1344,9 +1350,10 @@ TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) { TEST(CommonShapeFnsTest, AvgPool2DShapeTest) { ShapeInferenceTestOp op("AvgPool"); - auto set_op = [&op](const std::vector& strides, - const std::vector& ksizes, const string& padding, - const string& data_format) { + auto set_op = [&op](const std::vector& strides, + const std::vector& ksizes, + const std::string& padding, + const std::string& data_format) { TF_CHECK_OK(NodeDefBuilder("test", "AvgPool") .Input("input", 0, DT_FLOAT) .Attr("strides", strides) @@ -1390,9 +1397,10 @@ TEST(CommonShapeFnsTest, AvgPool2DShapeTest) { TEST(CommonShapeFnsTest, MaxPool2DShapeTest) { ShapeInferenceTestOp op("MaxPool"); - auto set_op = [&op](const std::vector& strides, - const std::vector& ksizes, const string& padding, - const string& data_format) { + auto set_op = [&op](const std::vector& strides, + const std::vector& ksizes, + const std::string& padding, + const std::string& data_format) { TF_CHECK_OK(NodeDefBuilder("test", "MaxPool") .Input("input", 0, DT_FLOAT) .Attr("strides", strides) @@ -1426,9 +1434,10 @@ TEST(CommonShapeFnsTest, MaxPoolV22DShapeTest) { ShapeInferenceTestOp op("MaxPoolV2"); Tensor ksizes_tensor, strides_tensor; auto set_op = [&op, &ksizes_tensor, &strides_tensor]( - const std::vector& strides, - const std::vector& ksizes, const string& padding, - const string& data_format) { + const std::vector& strides, + const std::vector& ksizes, + 
const std::string& padding, + const std::string& data_format) { TF_CHECK_OK(NodeDefBuilder("test", "MaxPoolV2") .Input("input", 0, DT_FLOAT) .Input("ksize", 1, DT_INT32) @@ -1436,11 +1445,11 @@ TEST(CommonShapeFnsTest, MaxPoolV22DShapeTest) { .Attr("padding", padding) .Attr("data_format", data_format) .Finalize(&op.node_def)); - ksizes_tensor = test::AsTensor(ksizes); + ksizes_tensor = test::AsTensor(ksizes); op.input_tensors.resize(3); op.input_tensors[0] = nullptr; op.input_tensors[1] = &ksizes_tensor; - strides_tensor = test::AsTensor(strides); + strides_tensor = test::AsTensor(strides); op.input_tensors[2] = &strides_tensor; }; @@ -1466,8 +1475,9 @@ TEST(CommonShapeFnsTest, MaxPoolV22DShapeTest) { TEST(CommonShapeFnsTest, Pool3DShapeTest) { ShapeInferenceTestOp op("MaxPool3D"); - auto set_op = [&op](const std::vector& strides, - const std::vector& ksizes, const string& padding) { + auto set_op = [&op](const std::vector& strides, + const std::vector& ksizes, + const std::string& padding) { TF_CHECK_OK(NodeDefBuilder("test", "MaxPool3D") .Input("input", 0, DT_FLOAT) .Attr("strides", strides) @@ -1524,28 +1534,28 @@ TEST(CommonShapeFnsTest, Reduce_ShapeFn) { INFER_OK(op, "[2,4,5];[2]", "?"); INFER_OK(op, "?;[2]", "?"); - Tensor indices = test::AsTensor({1, 2}); + Tensor indices = test::AsTensor({1, 2}); op.input_tensors[1] = &indices; // Reduction indices available INFER_OK(op, "[2,4,5];[2]", "[d0_0]"); // Wrapped indices - indices = test::AsTensor({-1, -2}); + indices = test::AsTensor({-1, -2}); op.input_tensors[1] = &indices; INFER_OK(op, "[2,4,5];[2]", "[d0_0]"); // Scalar - indices = test::AsScalar(0); + indices = test::AsScalar(0); op.input_tensors[1] = &indices; INFER_OK(op, "[2,4,5];[]", "[d0_1,d0_2]"); - indices = test::AsScalar(-4); + indices = test::AsScalar(-4); op.input_tensors[1] = &indices; INFER_ERROR("Invalid reduction dimension", op, "[2,4,5];[]"); // Empty reduction indices - indices = test::AsTensor({}); + indices = test::AsTensor({}); op.input_tensors[1] = &indices; INFER_OK(op, "[2,4,5];[0]", "[d0_0,d0_1,d0_2]"); @@ -1555,7 +1565,7 @@ TEST(CommonShapeFnsTest, Reduce_ShapeFn) { .Input("reduction_indices", 1, DT_INT32) .Attr("keep_dims", true) .Finalize(&op.node_def)); - indices = test::AsTensor({-1, -2}); + indices = test::AsTensor({-1, -2}); op.input_tensors[1] = &indices; INFER_OK(op, "[2,4,5];[2]", "[d0_0, 1, 1]"); @@ -1572,9 +1582,9 @@ TEST(CommonShapeFnsTest, Reduce_ShapeFn) { INFER_OK(op, "[?,?,?];[?,?]", "[?,?,?]"); // And when the tensor is specified, it's still allowed. op.input_tensors[1] = &indices; - indices = test::AsTensor({-1, -2}, TensorShape({2, 1})); + indices = test::AsTensor({-1, -2}, TensorShape({2, 1})); INFER_OK(op, "[2,4,5];[2,1]", "[d0_0, 1, 1]"); - indices = test::AsTensor({-1, -2}, TensorShape({1, 2})); + indices = test::AsTensor({-1, -2}, TensorShape({1, 2})); INFER_OK(op, "[2,4,5];[1,2]", "[d0_0, 1, 1]"); } diff --git a/tensorflow/core/framework/control_flow.h b/tensorflow/core/framework/control_flow.h index 3cc270b323d92f..a70ecb85214e31 100644 --- a/tensorflow/core/framework/control_flow.h +++ b/tensorflow/core/framework/control_flow.h @@ -22,7 +22,7 @@ limitations under the License. namespace tensorflow { -const uint64 kIllegalFrameId = ~0uLL; +const uint64_t kIllegalFrameId = ~0uLL; const int64_t kIllegalIterId = -1; // For the purpose of control flow, every tensor produced by TensorFlow is @@ -30,12 +30,12 @@ const int64_t kIllegalIterId = -1; // 'frame_id' and an 'iter_id'. 
The tensor value it represents is produced // in the frame with frame_id at the iteration of iter_id. struct FrameAndIter { - uint64 frame_id = kIllegalFrameId; + uint64_t frame_id = kIllegalFrameId; int64_t iter_id = kIllegalIterId; FrameAndIter() {} - FrameAndIter(uint64 frame, int64_t iter) { + FrameAndIter(uint64_t frame, int64_t iter) { frame_id = frame; iter_id = iter; } @@ -48,7 +48,7 @@ struct FrameAndIter { struct FrameAndIterHash { size_t operator()(const FrameAndIter& key) const { // Make sure there are no padding bytes that we don't want - CHECK_EQ(sizeof(uint64) + sizeof(int64_t), sizeof(FrameAndIter)); + CHECK_EQ(sizeof(uint64_t) + sizeof(int64_t), sizeof(FrameAndIter)); return Hash64(reinterpret_cast(&key), sizeof(FrameAndIter)); } }; diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc index 491eb5293f22ad..69593a67d90352 100644 --- a/tensorflow/core/framework/dataset.cc +++ b/tensorflow/core/framework/dataset.cc @@ -52,8 +52,9 @@ static mutex* get_dataset_op_registry_lock() { return &dataset_op_registry_lock; } -static std::unordered_set* get_dataset_op_registry() { - static std::unordered_set* names = new std::unordered_set; +static std::unordered_set* get_dataset_op_registry() { + static std::unordered_set* names = + new std::unordered_set; return names; } @@ -97,8 +98,8 @@ class DatasetVariantWrapper { DatasetBase* get() const { return dataset_; } - string TypeName() const { return "tensorflow::DatasetVariantWrapper"; } - string DebugString() const { + std::string TypeName() const { return "tensorflow::DatasetVariantWrapper"; } + std::string DebugString() const { if (dataset_) { return dataset_->DebugString(); } else { @@ -131,9 +132,11 @@ class WrappedDatasetVariantWrapper { Tensor get() const { return ds_tensor_; } - string TypeName() const { return "tensorflow::WrappedDatasetVariantWrapper"; } + std::string TypeName() const { + return "tensorflow::WrappedDatasetVariantWrapper"; + } - string DebugString() const { + std::string DebugString() const { return "tensorflow::WrappedDatasetVariantWrapper::DebugString"; } @@ -324,7 +327,7 @@ absl::Status GraphDefBuilderWrapper::AddDataset( } absl::Status GraphDefBuilderWrapper::AddFunction( - SerializationContext* ctx, const string& function_name, + SerializationContext* ctx, const std::string& function_name, const FunctionLibraryDefinition& lib_def) { if (b_->HasFunction(function_name)) { VLOG(1) << "Function with name " << function_name << "already exists in" @@ -338,7 +341,7 @@ absl::Status GraphDefBuilderWrapper::AddFunction( } FunctionDefLibrary def; *def.add_function() = *f_def; - const string gradient_func = lib_def.FindGradient(function_name); + const std::string gradient_func = lib_def.FindGradient(function_name); if (!gradient_func.empty()) { GradientDef* g_def = def.add_gradient(); g_def->set_function_name(function_name); @@ -380,8 +383,8 @@ void GraphDefBuilderWrapper::AddTensorInternal(const Tensor& val, b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val)); } -bool GraphDefBuilderWrapper::HasAttr(const string& name, - const string& attr_name) const { +bool GraphDefBuilderWrapper::HasAttr(const std::string& name, + const std::string& attr_name) const { const OpDef* op_def = nullptr; absl::Status s = b_->opts().op_registry()->LookUpOpDef(name, &op_def); if (!s.ok() || op_def == nullptr) { @@ -535,11 +538,11 @@ absl::Status MemoryCheckpoint::Save(IteratorStateWriter* writer) const { absl::Status IteratorBase::InitializeBase(IteratorContext* ctx, const IteratorBase* 
parent) { parent_ = parent; - id_ = - Hash64CombineUnordered(Hash64(prefix()), reinterpret_cast(this)); + id_ = Hash64CombineUnordered(Hash64(prefix()), + reinterpret_cast(this)); if (parent_) { parent_id_ = Hash64CombineUnordered(Hash64(parent_->prefix()), - reinterpret_cast(parent_)); + reinterpret_cast(parent_)); // This block of code is executed only when `parent_` is not a `nullptr` // because we do not create a `Node` in the `Model` for `RootDataset`. if (const auto& model = ctx->model()) { @@ -626,17 +629,17 @@ std::string FullName(const std::string& prefix, const std::string& name) { return strings::StrCat(kFullNameRandomHex, kPipe, prefix, kColon, name); } -absl::Status ExtractIteratorPrefix(absl::string_view key, string* prefix) { +absl::Status ExtractIteratorPrefix(absl::string_view key, std::string* prefix) { if (!absl::StartsWith(key, data::kFullNameRandomHex)) { return errors::InvalidArgument("Key: ", key, " was not generated using full_name."); } - std::vector split_keys = str_util::Split(key, data::kPipe); + std::vector split_keys = str_util::Split(key, data::kPipe); if (split_keys.size() != 2) { return errors::InvalidArgument("Key: ", key, " was not generated using full_name."); } - string real_key = split_keys[1]; + std::string real_key = split_keys[1]; const int pos = real_key.rfind(kColon); *prefix = real_key.substr(0, pos); return absl::OkStatus(); @@ -811,10 +814,11 @@ absl::Status DatasetBase::ComputeNumSources() { return absl::OkStatus(); } -absl::Status DatasetBase::CheckRandomAccessCompatible(const int64 index) const { +absl::Status DatasetBase::CheckRandomAccessCompatible( + const int64_t index) const { CardinalityOptions options; options.set_compute_level(CardinalityOptions::CARDINALITY_COMPUTE_MODERATE); - int64 cardinality = Cardinality(options); + int64_t cardinality = Cardinality(options); if (cardinality == kInfiniteCardinality || cardinality == kUnknownCardinality) { return tensorflow::errors::FailedPrecondition( @@ -829,13 +833,13 @@ absl::Status DatasetBase::CheckRandomAccessCompatible(const int64 index) const { return absl::OkStatus(); } -absl::Status DatasetBase::Get(OpKernelContext* ctx, int64 index, +absl::Status DatasetBase::Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) const { return errors::Unimplemented("Random access is not implemented for dataset ", DebugString()); } -absl::Status DatasetBase::Get(AnyContext ctx, int64 index, +absl::Status DatasetBase::Get(AnyContext ctx, int64_t index, std::vector* out_tensors) const { return errors::Unimplemented("Random access is not implemented for dataset ", DebugString()); @@ -876,7 +880,7 @@ absl::Status DatasetBase::MergeOptionsFromInputs() { absl::Status DatasetBase::MakeIterator( IteratorContext* ctx, const IteratorBase* parent, - const string& output_prefix, + const std::string& output_prefix, std::unique_ptr* iterator) const { if (type_string() == "OptionsDataset" || type_string() == "FinalizeDataset") { std::vector inputs; @@ -1103,8 +1107,8 @@ DatasetBaseIterator::~DatasetBaseIterator() { params_.dataset->Unref(); } -string DatasetBaseIterator::BuildTraceMeName() { - string result = +std::string DatasetBaseIterator::BuildTraceMeName() { + std::string result = strings::StrCat(params_.prefix, "#", traceme_metadata_, ",id=", id_); if (parent_) { absl::StrAppend(&result, ",parent_id=", parent_id_); @@ -1274,8 +1278,8 @@ void DatasetOpKernel::Compute(OpKernelContext* ctx) { } } -string DatasetOpKernel::TraceString(const OpKernelContext& ctx, - bool verbose) const { +std::string 
DatasetOpKernel::TraceString(const OpKernelContext& ctx, + bool verbose) const { return tsl::profiler::TraceMeOp(name_view(), type_string_view()); } @@ -1310,7 +1314,7 @@ bool DatasetOpKernel::IsDatasetOp(const OpDef& op_def) { // Check if the suffix matches "DatasetV[0-9]+". size_t index = op_name.length() - 1; - while (index >= 0 && isdigit(op_name[index])) { + while (index >= 0 && absl::ascii_isdigit(op_name[index])) { index--; } constexpr absl::string_view kDatasetPrefix = "DatasetV"; diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index b807208647c1cb..2471c3dc08cc0a 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -87,7 +87,7 @@ void MergeOptions(const protobuf::MessageLite& source, protobuf::MessageLite* destination); } // namespace internal -using TraceMeMetadata = std::vector>; +using TraceMeMetadata = std::vector>; // Maps the index of dataset elements to a globally shuffled index. See the // comment for IteratorContext::Params::index_mapper for more details. @@ -211,7 +211,7 @@ class IteratorStateWriter { std::string FullName(const std::string& prefix, const std::string& name); // Extracts iterator prefix from key generated by `FullName`. -absl::Status ExtractIteratorPrefix(absl::string_view key, string* prefix); +absl::Status ExtractIteratorPrefix(absl::string_view key, std::string* prefix); // Interface for objects that can be checkpointed. class Checkpointable { @@ -264,7 +264,7 @@ class GraphDefBuilderWrapper { return absl::OkStatus(); } - absl::Status AddVector(const std::vector& val, Node** output) { + absl::Status AddVector(const std::vector& val, Node** output) { Tensor val_t = Tensor(DataTypeToEnum::v(), TensorShape({static_cast(val.size())})); for (size_t i = 0; i < val.size(); i++) { @@ -350,7 +350,7 @@ class GraphDefBuilderWrapper { // or any of its dependent functions are stateful, and the context does not // explicitly permit stateful functions, returns an InvalidArgument error. 
absl::Status AddFunction(SerializationContext* ctx, - const string& function_name, + const std::string& function_name, const FunctionLibraryDefinition& lib_def); template @@ -371,9 +371,10 @@ class GraphDefBuilderWrapper { private: void AddPlaceholderInternal(const Tensor& val, Node** output); void AddTensorInternal(const Tensor& val, Node** output); - bool HasAttr(const string& op_type_name, const string& attr_name) const; + bool HasAttr(const std::string& op_type_name, + const std::string& attr_name) const; - bool HasAttr(const OpDef* op_def, const string& attr_name) const { + bool HasAttr(const OpDef* op_def, const std::string& attr_name) const { for (const auto& attr : op_def->attr()) { if (attr.name() == attr_name) { return true; @@ -509,35 +510,35 @@ class MemoryCheckpoint final : public IteratorStateWriter { // BEGIN implementation of `IteratorStateWriter` interface absl::Status WriteScalar(absl::string_view key, int64_t val) override { - string prefix; + std::string prefix; TF_RETURN_IF_ERROR(ExtractIteratorPrefix(key, &prefix)); return WriteScalar(prefix, key, val); } absl::Status WriteScalar(absl::string_view name, absl::string_view key, int64_t val) override { - auto id = id_registry_->Add(string(name), string(key)); + auto id = id_registry_->Add(std::string(name), std::string(key)); int_values_[id] = val; return absl::OkStatus(); } absl::Status WriteScalar(absl::string_view key, const tstring& val) override { - string prefix; + std::string prefix; TF_RETURN_IF_ERROR(ExtractIteratorPrefix(key, &prefix)); return WriteScalar(prefix, key, val); } absl::Status WriteScalar(absl::string_view name, absl::string_view key, const tstring& val) override { - auto id = id_registry_->Add(string(name), string(key)); + auto id = id_registry_->Add(std::string(name), std::string(key)); str_values_[id] = val; return absl::OkStatus(); } absl::Status WriteTensor(absl::string_view key, const Tensor& val) override { - string prefix; + std::string prefix; TF_RETURN_IF_ERROR(ExtractIteratorPrefix(key, &prefix)); return WriteTensor(prefix, key, val); } absl::Status WriteTensor(absl::string_view name, absl::string_view key, const Tensor& val) override { - auto id = id_registry_->Add(string(name), string(key)); + auto id = id_registry_->Add(std::string(name), std::string(key)); tensor_values_[id] = val; return absl::OkStatus(); } @@ -614,7 +615,8 @@ class SerializationContext { : resource_mgr(ctx->resource_manager()), device_name(ctx->device()->attributes().name()) {} - std::vector>* input_list = nullptr; // Not owned. + std::vector>* input_list = + nullptr; // Not owned. // Indicates what to do if the dataset depends on external state. ExternalStatePolicy external_state_policy = @@ -653,7 +655,7 @@ class SerializationContext { explicit SerializationContext(Params params) : params_(params) {} - std::vector>* input_list() { + std::vector>* input_list() { return params_.input_list; } @@ -773,7 +775,7 @@ class IteratorContext { // Records the number of ParallelInterleave operations in the path from the // root node to this node (not including this node) in the input pipeline // tree. - int64 interleave_depth = 0; + int64_t interleave_depth = 0; // Marks whether the iterator is restored from a checkpoint. bool is_restoring = false; @@ -795,7 +797,7 @@ class IteratorContext { std::function)> runner = nullptr; // Number of threads used for executing user-defined functions. - int32 runner_threadpool_size = 0; + int32_t runner_threadpool_size = 0; // Split providers indicating which splits to process. 
May be empty, // indicating that the iterator should process all splits. @@ -891,7 +893,7 @@ class IteratorContext { MemoryCheckpoint* checkpoint() { return &checkpoint_; } - int64 interleave_depth() { return params_.interleave_depth; } + int64_t interleave_depth() { return params_.interleave_depth; } bool is_restoring() { return params_.is_restoring; } @@ -909,7 +911,7 @@ class IteratorContext { return ¶ms_.runner; } - int32 runner_threadpool_size() { return params_.runner_threadpool_size; } + int32_t runner_threadpool_size() { return params_.runner_threadpool_size; } std::vector> split_providers() const { return params_.split_providers; @@ -949,7 +951,7 @@ class IteratorContext { params_.index_mapper = index_mapper; }; - std::unique_ptr CreateThreadPool(const string& name, + std::unique_ptr CreateThreadPool(const std::string& name, int num_threads) { if (params_.thread_pool) { // Create a `ThreadPool` instance by wrapping `params_.thread_pool` (which @@ -1010,7 +1012,7 @@ class IteratorContext { } } - std::unique_ptr StartThread(const string& name, + std::unique_ptr StartThread(const std::string& name, std::function fn) { if (params_.thread_factory) { return params_.thread_factory->StartThread(name, std::move(fn)); @@ -1133,7 +1135,7 @@ class IteratorBase : public Checkpointable { // Returns a string that identifies the sequence of iterators leading up to // this iterator. - virtual const string& prefix() const = 0; + virtual const std::string& prefix() const = 0; // Indicates whether the iterator is compatible with symbolic checkpointing. virtual bool SymbolicCheckpointCompatible() const { return false; } @@ -1248,9 +1250,9 @@ class IteratorBase : public Checkpointable { class DatasetContext { public: struct Params { - string type_string; // op type name of this dataset. - string node_name; // graph node name of this dataset op, uniquely - // identifying the dataset in the graph. + std::string type_string; // op type name of this dataset. + std::string node_name; // graph node name of this dataset op, uniquely + // identifying the dataset in the graph. }; explicit DatasetContext(Params params) : params_(std::move(params)) {} @@ -1260,8 +1262,8 @@ class DatasetContext { params_.node_name = ctx->op_kernel().name(); } - const string& type_string() const { return params_.type_string; } - const string& node_name() const { return params_.node_name; } + const std::string& type_string() const { return params_.type_string; } + const std::string& node_name() const { return params_.node_name; } private: Params params_; @@ -1304,11 +1306,11 @@ class DatasetBase : public core::RefCounted { : type_string_(ctx.type_string()), node_name_(ctx.node_name()) {} // Op type name of this dataset. - const string& type_string() const { return type_string_; } + const std::string& type_string() const { return type_string_; } // Graph node name of this dataset op, uniquely identifying the dataset in // the graph. - const string& node_name() const { return node_name_; } + const std::string& node_name() const { return node_name_; } const Metadata& metadata() const { return metadata_; } @@ -1330,18 +1332,18 @@ class DatasetBase : public core::RefCounted { // The prefix identifies the sequence of iterators leading up to the newly // created iterator. 
absl::Status MakeIterator(IteratorContext* ctx, const IteratorBase* parent, - const string& output_prefix, + const std::string& output_prefix, std::unique_ptr* iterator) const; absl::Status MakeIterator(IteratorContext&& ctx, const IteratorBase* parent, - const string& output_prefix, + const std::string& output_prefix, std::unique_ptr* iterator) const { return MakeIterator(&ctx, parent, output_prefix, iterator); } // Returns a new iterator restored from the checkpoint data in `reader`. absl::Status MakeIteratorFromCheckpoint( - IteratorContext* ctx, const string& output_prefix, + IteratorContext* ctx, const std::string& output_prefix, IteratorStateReader* reader, std::unique_ptr* iterator) const { std::unique_ptr it; @@ -1357,7 +1359,7 @@ class DatasetBase : public core::RefCounted { } absl::Status MakeIteratorFromCheckpoint( - IteratorContext&& ctx, const string& output_prefix, + IteratorContext&& ctx, const std::string& output_prefix, IteratorStateReader* reader, std::unique_ptr* iterator) const { return MakeIteratorFromCheckpoint(&ctx, output_prefix, reader, iterator); @@ -1405,7 +1407,7 @@ class DatasetBase : public core::RefCounted { } // A human-readable debug string for this dataset. - virtual string DebugString() const = 0; + virtual std::string DebugString() const = 0; // Stores the dataset's input datasets in `*inputs`. The pointers stored in // `*inputs` are borrowed. The only valid non-ok return status is @@ -1423,16 +1425,16 @@ class DatasetBase : public core::RefCounted { virtual absl::Status CheckExternalState() const = 0; // Indicates whether the dataset is compatible with random access. - absl::Status CheckRandomAccessCompatible(const int64 index) const; + absl::Status CheckRandomAccessCompatible(const int64_t index) const; // Return the element at a particular index for a randomly accessible dataset. - virtual absl::Status Get(OpKernelContext* ctx, int64 index, + virtual absl::Status Get(OpKernelContext* ctx, int64_t index, std::vector* out_tensors) const; // Same as above, but with an `AnyContext`, which can be constructed from // either an `OpKernelContext` or `IteratorContext`. Used to support datasets // that provide random access through both the dataset and iterator APIs. - virtual absl::Status Get(AnyContext ctx, int64 index, + virtual absl::Status Get(AnyContext ctx, int64_t index, std::vector* out_tensors) const; // Returns true if the dataset and its inputs support random access. @@ -1487,7 +1489,7 @@ class DatasetBase : public core::RefCounted { Node** node) const = 0; virtual std::unique_ptr MakeIteratorInternal( - const string& prefix) const = 0; + const std::string& prefix) const = 0; void set_options(const Options& options) { options_ = options; } @@ -1505,8 +1507,8 @@ class DatasetBase : public core::RefCounted { // how they appear for this dataset. absl::Status MergeOptionsFromInputs(); - const string type_string_; - const string node_name_; + const std::string type_string_; + const std::string node_name_; Metadata metadata_; Options options_; mutable mutex mu_; @@ -1527,7 +1529,7 @@ class DatasetBaseIterator : public IteratorBase { const DatasetBase* dataset; // Identifies the sequence of iterators leading up to this iterator. 
- const string prefix; + const std::string prefix; }; explicit DatasetBaseIterator(const BaseParams& params); @@ -1544,13 +1546,13 @@ class DatasetBaseIterator : public IteratorBase { return params_.dataset->output_shapes(); } - const string& prefix() const override { return params_.prefix; } + const std::string& prefix() const override { return params_.prefix; } // Returns a name to be used for the TraceMe event. // // NOTE: TraceMe supports passing key-value pairs of "arguments" using the // following format "name#arg_1=value_,...,arg_n=value_n". - string BuildTraceMeName(); + std::string BuildTraceMeName(); absl::Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) final; @@ -1602,7 +1604,7 @@ class DatasetBaseIterator : public IteratorBase { virtual absl::Status SkipInternal(IteratorContext* ctx, int num_to_skip, bool* end_of_sequence, int* num_skipped); - string full_name(const string& name) const { + std::string full_name(const std::string& name) const { return FullName(params_.prefix, name); } @@ -1693,7 +1695,7 @@ class DatasetBaseIterator : public IteratorBase { return ctx->model() && node_; } - string traceme_metadata_; + std::string traceme_metadata_; BaseParams params_; }; @@ -1707,7 +1709,7 @@ class DatasetIterator : public DatasetBaseIterator { const DatasetType* dataset; // Identifies the sequence of iterators leading up to this iterator. - const string prefix; + const std::string prefix; }; explicit DatasetIterator(const Params& params) @@ -1774,7 +1776,8 @@ class DatasetOpKernel : public OpKernel { // names that end with "Dataset" or "DatasetV[0-9]+". static bool IsDatasetOp(const OpDef& op_def); - string TraceString(const OpKernelContext& ctx, bool verbose) const override; + std::string TraceString(const OpKernelContext& ctx, + bool verbose) const override; protected: // Subclasses should implement this method. It will be called during Compute diff --git a/tensorflow/core/framework/dataset_stateful_op_allowlist.h b/tensorflow/core/framework/dataset_stateful_op_allowlist.h index cc25c801bf60b1..14b16b309ea5c1 100644 --- a/tensorflow/core/framework/dataset_stateful_op_allowlist.h +++ b/tensorflow/core/framework/dataset_stateful_op_allowlist.h @@ -25,17 +25,17 @@ namespace data { // See below macro for usage details. 
class AllowlistedStatefulOpRegistry { public: - absl::Status Add(string op_name) { + absl::Status Add(std::string op_name) { op_names_.insert(std::move(op_name)); return absl::OkStatus(); } - absl::Status Remove(string op_name) { + absl::Status Remove(std::string op_name) { op_names_.erase(op_name); return absl::OkStatus(); } - bool Contains(const string& op_name) { return op_names_.count(op_name); } + bool Contains(const std::string& op_name) { return op_names_.count(op_name); } static AllowlistedStatefulOpRegistry* Global() { static auto* reg = new AllowlistedStatefulOpRegistry; @@ -49,7 +49,7 @@ class AllowlistedStatefulOpRegistry { AllowlistedStatefulOpRegistry operator=( AllowlistedStatefulOpRegistry const& copy) = delete; - std::unordered_set op_names_; + std::unordered_set op_names_; }; } // namespace data diff --git a/tensorflow/core/framework/dataset_test.cc b/tensorflow/core/framework/dataset_test.cc index 66213ea5721b13..b572de72e54113 100644 --- a/tensorflow/core/framework/dataset_test.cc +++ b/tensorflow/core/framework/dataset_test.cc @@ -68,8 +68,8 @@ TEST_P(DatasetTestTotalBytes, TestTotalBytes) { } std::vector tensor_tf_int_32s() { - return {test::AsTensor({1, 2, 3, 4, 5}), - test::AsTensor({1, 2, 3, 4})}; + return {test::AsTensor({1, 2, 3, 4, 5}), + test::AsTensor({1, 2, 3, 4})}; } std::vector tensor_tf_int_64s() { diff --git a/tensorflow/core/framework/device.cc b/tensorflow/core/framework/device.cc index 59730e3ce1d436..1adb6e7eaf1641 100644 --- a/tensorflow/core/framework/device.cc +++ b/tensorflow/core/framework/device.cc @@ -41,8 +41,8 @@ void Device::Sync(const DoneCallback& done) { done(Sync()); } // static DeviceAttributes Device::BuildDeviceAttributes( - const string& name, DeviceType device, Bytes memory_limit, - const DeviceLocality& locality, const string& physical_device_desc) { + const std::string& name, DeviceType device, Bytes memory_limit, + const DeviceLocality& locality, const std::string& physical_device_desc) { DeviceAttributes da; da.set_name(name); do { diff --git a/tensorflow/core/framework/device_base.cc b/tensorflow/core/framework/device_base.cc index 44db0a284f1f79..891d45f237e61e 100644 --- a/tensorflow/core/framework/device_base.cc +++ b/tensorflow/core/framework/device_base.cc @@ -66,7 +66,7 @@ const DeviceAttributes& DeviceBase::attributes() const { std::abort(); } -const string& DeviceBase::name() const { +const std::string& DeviceBase::name() const { LOG(FATAL) << "DeviceBase does not implement name()"; // Crash OK std::abort(); } diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h index fe5099fa361429..15c4e6bba6ae9e 100644 --- a/tensorflow/core/framework/device_base.h +++ b/tensorflow/core/framework/device_base.h @@ -269,7 +269,7 @@ class DeviceBase { // device memory tagged with an earlier freed-at count is really unencumbered // by pending uses. For this to be useful the device memory allocator must // be tagging deallocated memory chunks using the same counter. - virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; } + virtual uint64_t SafeAllocFrontier(uint64_t old_value) { return 0; } // Copies `input_tensor` to `output_tensor`, where both tensors are on this // device. 
This function assumes that `output_tensor` has already been diff --git a/tensorflow/core/framework/device_factory.cc b/tensorflow/core/framework/device_factory.cc index 392b44f2eb177c..d6374d41a93bb7 100644 --- a/tensorflow/core/framework/device_factory.cc +++ b/tensorflow/core/framework/device_factory.cc @@ -47,14 +47,14 @@ struct FactoryItem { bool is_pluggable_device; }; -std::unordered_map<string, FactoryItem>& device_factories() { - static std::unordered_map<string, FactoryItem>* factories = - new std::unordered_map<string, FactoryItem>; +std::unordered_map<std::string, FactoryItem>& device_factories() { + static std::unordered_map<std::string, FactoryItem>* factories = + new std::unordered_map<std::string, FactoryItem>; return *factories; } -bool IsDeviceFactoryEnabled(const string& device_type) { - std::vector<string> enabled_devices; +bool IsDeviceFactoryEnabled(const std::string& device_type) { + std::vector<std::string> enabled_devices; TF_CHECK_OK(tensorflow::ReadStringsFromEnvVar( /*env_var_name=*/"TF_ENABLED_DEVICE_TYPES", /*default_val=*/"", &enabled_devices)); @@ -67,9 +67,9 @@ bool IsDeviceFactoryEnabled(const string& device_type) { } // namespace // static -int32 DeviceFactory::DevicePriority(const string& device_type) { +int32_t DeviceFactory::DevicePriority(const std::string& device_type) { tf_shared_lock l(*get_device_factory_lock()); - std::unordered_map<string, FactoryItem>& factories = device_factories(); + std::unordered_map<std::string, FactoryItem>& factories = device_factories(); auto iter = factories.find(device_type); if (iter != factories.end()) { return iter->second.priority; @@ -78,9 +78,9 @@ int32 DeviceFactory::DevicePriority(const string& device_type) { return -1; } -bool DeviceFactory::IsPluggableDevice(const string& device_type) { +bool DeviceFactory::IsPluggableDevice(const std::string& device_type) { tf_shared_lock l(*get_device_factory_lock()); - std::unordered_map<string, FactoryItem>& factories = device_factories(); + std::unordered_map<std::string, FactoryItem>& factories = device_factories(); auto iter = factories.find(device_type); if (iter != factories.end()) { return iter->second.is_pluggable_device; @@ -89,7 +89,7 @@ bool DeviceFactory::IsPluggableDevice(const string& device_type) { } // static -void DeviceFactory::Register(const string& device_type, +void DeviceFactory::Register(const std::string& device_type, std::unique_ptr<DeviceFactory> factory, int priority, bool is_pluggable_device) { if (!IsDeviceFactoryEnabled(device_type)) { @@ -98,7 +98,7 @@ void DeviceFactory::Register(const string& device_type, return; } mutex_lock l(*get_device_factory_lock()); - std::unordered_map<string, FactoryItem>& factories = device_factories(); + std::unordered_map<std::string, FactoryItem>& factories = device_factories(); auto iter = factories.find(device_type); if (iter == factories.end()) { factories[device_type] = {std::move(factory), priority, @@ -113,7 +113,7 @@ void DeviceFactory::Register(const string& device_type, } } -DeviceFactory* DeviceFactory::GetFactory(const string& device_type) { +DeviceFactory* DeviceFactory::GetFactory(const std::string& device_type) { tf_shared_lock l(*get_device_factory_lock()); auto it = device_factories().find(device_type); if (it == device_factories().end()) { @@ -128,7 +128,7 @@ DeviceFactory* DeviceFactory::GetFactory(const string& device_type) { } absl::Status DeviceFactory::ListAllPhysicalDevices( - std::vector<string>* devices) { + std::vector<std::string>* devices) { // CPU first. A CPU device is required. // TODO(b/183974121): Consider merge the logic into the loop below.
auto cpu_factory = GetFactory("CPU"); @@ -156,7 +156,7 @@ absl::Status DeviceFactory::ListAllPhysicalDevices( } absl::Status DeviceFactory::ListPluggablePhysicalDevices( - std::vector<string>* devices) { + std::vector<std::string>* devices) { tf_shared_lock l(*get_device_factory_lock()); for (auto& p : device_factories()) { if (p.second.is_pluggable_device) { @@ -168,7 +168,7 @@ absl::Status DeviceFactory::ListPluggablePhysicalDevices( } absl::Status DeviceFactory::GetAnyDeviceDetails( - int device_index, std::unordered_map<string, string>* details) { + int device_index, std::unordered_map<std::string, std::string>* details) { if (device_index < 0) { return errors::InvalidArgument("Device index out of bounds: ", device_index); @@ -183,7 +183,7 @@ absl::Status DeviceFactory::GetAnyDeviceDetails( } // TODO(b/183974121): Consider merge the logic into the loop below. - std::vector<string> devices; + std::vector<std::string> devices; TF_RETURN_IF_ERROR(cpu_factory->ListPhysicalDevices(&devices)); if (device_index < devices.size()) { return cpu_factory->GetDeviceDetails(device_index, details); @@ -211,7 +211,7 @@ absl::Status DeviceFactory::GetAnyDeviceDetails( } absl::Status DeviceFactory::AddCpuDevices( - const SessionOptions& options, const string& name_prefix, + const SessionOptions& options, const std::string& name_prefix, std::vector<std::unique_ptr<Device>>* devices) { auto cpu_factory = GetFactory("CPU"); if (!cpu_factory) { @@ -228,7 +228,7 @@ absl::Status DeviceFactory::AddCpuDevices( } absl::Status DeviceFactory::AddDevices( - const SessionOptions& options, const string& name_prefix, + const SessionOptions& options, const std::string& name_prefix, std::vector<std::unique_ptr<Device>>* devices) { // CPU first. A CPU device is required. // TODO(b/183974121): Consider merge the logic into the loop below. @@ -263,9 +263,9 @@ absl::Status DeviceFactory::AddDevices( return absl::OkStatus(); } -std::unique_ptr<Device> DeviceFactory::NewDevice(const string& type, - const SessionOptions& options, - const string& name_prefix) { +std::unique_ptr<Device> DeviceFactory::NewDevice( + const std::string& type, const SessionOptions& options, + const std::string& name_prefix) { auto device_factory = GetFactory(type); if (!device_factory) { return nullptr; diff --git a/tensorflow/core/framework/device_factory.h b/tensorflow/core/framework/device_factory.h index 8b07d15cfc0dac..e30a4538fa939a 100644 --- a/tensorflow/core/framework/device_factory.h +++ b/tensorflow/core/framework/device_factory.h @@ -58,34 +58,35 @@ class DeviceFactory { // Helper for tests. Create a single device of type "type". The // returned device is always numbered zero, so if creating multiple // devices of the same type, supply distinct name_prefix arguments. - static std::unique_ptr<Device> NewDevice(const string& type, + static std::unique_ptr<Device> NewDevice(const std::string& type, const SessionOptions& options, - const string& name_prefix); + const std::string& name_prefix); // Iterate through all device factories and build a list of all of the // possible physical devices. // // CPU is are added first. - static absl::Status ListAllPhysicalDevices(std::vector<string>* devices); + static absl::Status ListAllPhysicalDevices(std::vector<std::string>* devices); // Iterate through all device factories and build a list of all of the // possible pluggable physical devices. static absl::Status ListPluggablePhysicalDevices( - std::vector<string>* devices); + std::vector<std::string>* devices); // Get details for a specific device among all device factories. // 'device_index' indexes into devices from ListAllPhysicalDevices.
static absl::Status GetAnyDeviceDetails( - int device_index, std::unordered_map<string, string>* details); + int device_index, std::unordered_map<std::string, std::string>* details); // For a specific device factory list all possible physical devices. - virtual absl::Status ListPhysicalDevices(std::vector<string>* devices) = 0; + virtual absl::Status ListPhysicalDevices( + std::vector<std::string>* devices) = 0; // Get details for a specific device for a specific factory. Subclasses // can store arbitrary device information in the map. 'device_index' indexes // into devices from ListPhysicalDevices. virtual absl::Status GetDeviceDetails( - int device_index, std::unordered_map<string, string>* details) { + int device_index, std::unordered_map<std::string, std::string>* details) { return absl::OkStatus(); } @@ -106,7 +107,7 @@ class DeviceFactory { // higher than the packaged devices. See calls to // REGISTER_LOCAL_DEVICE_FACTORY to see the existing priorities used // for built-in devices. - static int32 DevicePriority(const std::string& device_type); + static int32_t DevicePriority(const std::string& device_type); // Returns true if 'device_type' is registered from plugin. Returns false if // 'device_type' is a first-party device. diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc index ec424f890883eb..c295e18bca197b 100644 --- a/tensorflow/core/framework/fake_input.cc +++ b/tensorflow/core/framework/fake_input.cc @@ -36,7 +36,7 @@ class FakeInputImpl { absl::Status AddInputToBuilder(); private: - static string FakeNodeName(int in_index); + static std::string FakeNodeName(int in_index); absl::Status GetN(int* n) const; absl::Status GetDataType(DataType* dt) const; void NSources(int n, DataType dt) const; @@ -44,7 +44,7 @@ class FakeInputImpl { const OpDef* const op_def_; const OpDef::ArgDef* const arg_; - const string in_node_; + const std::string in_node_; const NodeDef* const node_def_; NodeDefBuilder* const builder_; @@ -120,9 +120,9 @@ absl::Status FakeInputImpl::AddInputToBuilder() { } // static -string FakeInputImpl::FakeNodeName(int in_index) { +std::string FakeInputImpl::FakeNodeName(int in_index) { char c = 'a' + (in_index % 26); - return string(&c, 1); + return std::string(&c, 1); } absl::Status FakeInputImpl::GetN(int* n) const { diff --git a/tensorflow/core/framework/full_type_inference_util.cc b/tensorflow/core/framework/full_type_inference_util.cc index 029ca251b536c2..2fc478466337e7 100644 --- a/tensorflow/core/framework/full_type_inference_util.cc +++ b/tensorflow/core/framework/full_type_inference_util.cc @@ -342,7 +342,7 @@ TypeInferenceFn MapCovariant(FullTypeId t, FullTypeId u, int input_idx) { }; } -TypeInferenceFn FunctionCall(const string& func_attr_name) { +TypeInferenceFn FunctionCall(const std::string& func_attr_name) { return [func_attr_name](const TypeRefVector& input_types, const FunctionTypeInferrer& infer_function_rets) -> absl::StatusOr<FullTypeDef> { diff --git a/tensorflow/core/framework/full_type_inference_util.h b/tensorflow/core/framework/full_type_inference_util.h index 3117613bcd130f..211768f4a0083b 100644 --- a/tensorflow/core/framework/full_type_inference_util.h +++ b/tensorflow/core/framework/full_type_inference_util.h @@ -122,7 +122,7 @@ TypeInferenceFn MapCovariant(FullTypeId t, FullTypeId u, int input_idx); // Helper for ops with semantics of calling a function. The function is // specified indirectly, as the name of an attribute that holds the actual // function name.
-TypeInferenceFn FunctionCall(const string& func_attr_name); +TypeInferenceFn FunctionCall(const std::string& func_attr_name); // Compose the type of a function by concatenating the outputs of multiple // type inference functions. If func_list is {type inference function 1, type diff --git a/tensorflow/core/framework/full_type_util.cc b/tensorflow/core/framework/full_type_util.cc index 0bc3e04faf0e49..ea5fad4f704ff3 100644 --- a/tensorflow/core/framework/full_type_util.cc +++ b/tensorflow/core/framework/full_type_util.cc @@ -51,7 +51,7 @@ OpTypeConstructor Nullary(FullTypeId t) { }; } -OpTypeConstructor Unary(FullTypeId t, const string& var_name) { +OpTypeConstructor Unary(FullTypeId t, const std::string& var_name) { return [t, var_name](OpDef* op_def) { FullTypeDef* tdef = op_def->mutable_output_arg(0)->mutable_experimental_full_type(); @@ -93,7 +93,8 @@ OpTypeConstructor UnaryTensorContainer(FullTypeId t, FullTypeId dtype) { }; } -OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name) { +OpTypeConstructor UnaryTensorContainer(FullTypeId t, + const std::string& var_name) { return [t, var_name](OpDef* op_def) { FullTypeDef* tdef = op_def->mutable_output_arg(0)->mutable_experimental_full_type(); @@ -110,7 +111,7 @@ OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name) { } OpTypeConstructor VariadicTensorContainer(FullTypeId t, - const string& var_name) { + const std::string& var_name) { return [t, var_name](OpDef* op_def) { FullTypeDef* tdef = op_def->mutable_output_arg(0)->mutable_experimental_full_type(); diff --git a/tensorflow/core/framework/full_type_util.h b/tensorflow/core/framework/full_type_util.h index 76045e1bf1a777..392871a189651e 100644 --- a/tensorflow/core/framework/full_type_util.h +++ b/tensorflow/core/framework/full_type_util.h @@ -52,7 +52,7 @@ OpTypeConstructor NoOutputs(); OpTypeConstructor Nullary(FullTypeId t); // Helper for a type constructor of <t>[FT_VAR[<var_name>]]. -OpTypeConstructor Unary(FullTypeId t, const string& var_name); +OpTypeConstructor Unary(FullTypeId t, const std::string& var_name); // Helper for a type constructor of <t>[FT_ANY]. OpTypeConstructor UnaryGeneric(FullTypeId t); @@ -61,7 +61,8 @@ OpTypeConstructor UnaryGeneric(FullTypeId t); OpTypeConstructor UnaryTensorContainer(FullTypeId t, FullTypeId dtype); // Helper for a type constructor of <t>[FT_VAR[<var_name>]]. -OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name); +OpTypeConstructor UnaryTensorContainer(FullTypeId t, + const std::string& var_name); // Helper for a type constructor of // <t>[FT_FOR_EACH[ @@ -69,7 +70,8 @@ OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name); // FT_TENSOR[FT_VAR[<var_name>]], // FT_VAR[<var_name>]]. // Multi-valued type variables will expand the template (see full_type.proto). -OpTypeConstructor VariadicTensorContainer(FullTypeId t, const string& var_name); +OpTypeConstructor VariadicTensorContainer(FullTypeId t, + const std::string& var_name); // Type specialization and inference logic. This function narrows the type // specified in an op definition.
Such types are usually generic and dependent diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 2b778ca0134c70..91653d2cb0936f 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -122,7 +122,7 @@ absl::Status ArgNumType(AttrSlice attrs, const OpDef::ArgDef& arg_def, namespace { template <typename T> -void AddAttr(const string& name, const T& val, NodeDef* ndef) { +void AddAttr(const std::string& name, const T& val, NodeDef* ndef) { SetAttrValue(val, &((*ndef->mutable_attr())[name])); } @@ -207,7 +207,7 @@ class FunctionInstantiationHelper { "Expected arg_index to be equal to the number of nodes in result.", " Got ", arg_index, " and ", result_.nodes.size()); } - string name = arg_def.name(); + std::string name = arg_def.name(); if (dtypes.size() > 1) { absl::StrAppend(&name, "_", i); } @@ -259,7 +259,7 @@ class FunctionInstantiationHelper { ArgNumType(attrs, node_sig->output_arg(i), &is_type_list, &dtypes)); // Note that we rely on the backwards-compatibility test enforcing // that output_arg(*).name() doesn't change here. - const string base_name = + const std::string base_name = absl::StrCat(node.name(), ":", node_sig->output_arg(i).name()); TF_RETURN_IF_ERROR( AddItem(base_name, {false, arg_index, start, is_type_list, dtypes})); @@ -299,7 +299,7 @@ class FunctionInstantiationHelper { " >= ", fnode.input_size()); } // Look up the next input. - const string& input_name = fnode.input(fnode_arg_index); + const std::string& input_name = fnode.input(fnode_arg_index); const auto* item = GetItemOrNull(input_name); if (item == nullptr) { return errors::InvalidArgument( @@ -331,15 +331,15 @@ class FunctionInstantiationHelper { // Control deps. for (int i = fnode_arg_index; i < fnode.input_size(); ++i) { - const string& input = fnode.input(i); + const std::string& input = fnode.input(i); if (input.empty() || input[0] != '^') { return errors::InvalidArgument("Expected input[", i, "] == '", input, "' to be a control input."); } int nid = -1; - const string node_name = input.substr(1); - const string node_colon = node_name + ":"; - const string node_colon_bound = node_name + ";"; + const std::string node_name = input.substr(1); + const std::string node_colon = node_name + ":"; + const std::string node_colon_bound = node_name + ";"; // index_ is a map sorted lexicographically, so the key we are looking for // must lie in the range [node_name, node_colon_bound). auto it = index_.lower_bound(node_name); @@ -379,7 +379,7 @@ class FunctionInstantiationHelper { absl::Status AddReturnNode( const OpDef::ArgDef& ret_def, AttrSlice attrs, - const ::tensorflow::protobuf::Map<string, string>& ret_map, + const ::tensorflow::protobuf::Map<std::string, std::string>& ret_map, bool ints_on_device, int* ret_index) { auto ret_iter = ret_map.find(ret_def.name()); if (ret_iter == ret_map.end()) { @@ -401,7 +401,7 @@ class FunctionInstantiationHelper { DataTypeVectorString(item->dtypes)); } for (size_t i = 0; i < dtypes.size(); ++i) { - string name = absl::StrCat(ret_def.name(), "_RetVal"); + std::string name = absl::StrCat(ret_def.name(), "_RetVal"); if (dtypes.size() > 1) { absl::StrAppend(&name, "_", i); } @@ -456,7 +456,7 @@ class FunctionInstantiationHelper { }; // Adds an item into the input name index. - absl::Status AddItem(const string& name, const NameInfoItem& item) { + absl::Status AddItem(const std::string& name, const NameInfoItem& item) { if (!index_.insert({name, item}).second) { return errors::InvalidArgument( absl::StrCat("Duplicated ", item.is_func_arg ?
"arg" : "ret", @@ -466,20 +466,20 @@ class FunctionInstantiationHelper { return absl::OkStatus(); } - const NameInfoItem* GetItemOrNull(const string& name) const { + const NameInfoItem* GetItemOrNull(const std::string& name) const { return gtl::FindOrNull(index_, name); } - string Dep(int node_index) const { + std::string Dep(int node_index) const { return absl::StrCat("^", Name(node_index)); } - string Name(int node_index) const { + std::string Name(int node_index) const { CHECK_LT(node_index, nodes_.size()); return nodes_[node_index].name; } - string Name(int node_index, int output_index) const { + std::string Name(int node_index, int output_index) const { if (output_index == 0) { return Name(node_index); } else { @@ -487,7 +487,7 @@ class FunctionInstantiationHelper { } } - NodeDef* AddNode(const string& name) { + NodeDef* AddNode(const std::string& name) { result_.nodes.emplace_back(); NodeDef* gnode = &result_.nodes.back(); gnode->set_name(name); @@ -510,11 +510,11 @@ class FunctionInstantiationHelper { GetFunctionSignature get_function_; InstantiationResult& result_; // A small index for all names that can be used as a node's input arguments. - std::map index_; + std::map index_; // This contains information about a node in the new graph including the node // names and input nodes' indexes. struct NodeInfo { - string name; + std::string name; // Data inputs where means arg k of node n. std::vector> data_inputs; // Control inputs (dependencies). @@ -525,8 +525,8 @@ class FunctionInstantiationHelper { }; // Various helpers Print(proto) to print relevant protos to ascii. -string Print(const OpDef::ArgDef& arg) { - string out; +std::string Print(const OpDef::ArgDef& arg) { + std::string out; absl::StrAppend(&out, arg.name(), ":"); if (arg.is_ref()) absl::StrAppend(&out, "Ref("); if (!arg.number_attr().empty()) { @@ -545,13 +545,13 @@ string Print(const OpDef::ArgDef& arg) { // When hash_string_attrs = true, string attributes are hashed instead of being // truncated with ellipses. This is done to reduce the chance of collisions when // looking up functions using the canonical representation. -string Print(const AttrValue& attr_value, - const bool hash_string_attrs = false) { +std::string Print(const AttrValue& attr_value, + const bool hash_string_attrs = false) { if (attr_value.value_case() == AttrValue::kType) { return DataTypeString(attr_value.type()); } else if ((attr_value.value_case() == AttrValue::kList) && (attr_value.list().type_size() > 0)) { - string ret = "{"; + std::string ret = "{"; for (int i = 0; i < attr_value.list().type_size(); ++i) { if (i > 0) absl::StrAppend(&ret, ", "); absl::StrAppend(&ret, DataTypeString(attr_value.list().type(i))); @@ -562,7 +562,7 @@ string Print(const AttrValue& attr_value, if (attr_value.func().attr_size() == 0) { return attr_value.func().name(); } - std::vector entries; + std::vector entries; for (const auto& p : attr_value.func().attr()) { entries.push_back(absl::StrCat(p.first, "=", Print(p.second))); } @@ -576,11 +576,11 @@ string Print(const AttrValue& attr_value, } // TODO(josh11b): Merge this with SummarizeNodeDef(). 
-string Print(const NodeDef& n) { - string out; +std::string Print(const NodeDef& n) { + std::string out; absl::StrAppend(&out, n.name(), " = ", n.op()); if (n.attr_size() > 0) { - std::vector<string> entries; + std::vector<std::string> entries; for (auto& a : n.attr()) { entries.push_back(absl::StrCat(a.first, "=", Print(a.second))); } @@ -598,7 +598,7 @@ string Print(const NodeDef& n) { } absl::StrAppend(&out, "("); std::vector<absl::string_view> dat; - std::vector<string> dep; + std::vector<std::string> dep; for (absl::string_view s : n.input()) { if (absl::ConsumePrefix(&s, "^")) { dep.emplace_back(s); @@ -613,8 +613,8 @@ string Print(const NodeDef& n) { return out; } -string Print(const FunctionDef& fdef) { - string out; +std::string Print(const FunctionDef& fdef) { + std::string out; const OpDef& sig = fdef.signature(); absl::StrAppend(&out, "\n", sig.name()); if (sig.attr_size() > 0) { @@ -654,7 +654,7 @@ string Print(const FunctionDef& fdef) { return out; } -string Print(absl::Span<const NodeDef* const> nodes) { +std::string Print(absl::Span<const NodeDef* const> nodes) { std::vector<const NodeDef*> arg; std::vector<const NodeDef*> ret; std::vector<const NodeDef*> body; @@ -678,7 +678,7 @@ string Print(absl::Span<const NodeDef* const> nodes) { }; std::sort(arg.begin(), arg.end(), comp); std::sort(ret.begin(), ret.end(), comp); - string out; + std::string out; absl::StrAppend(&out, "\n("); auto get_type_and_device = [](const NodeDef& n) { DataType dt; @@ -714,7 +714,7 @@ // The _RetVal op should have a unique non-control input. We assert that // here and add it to the output. bool found_non_control_input = false; - for (const string& input : n->input()) { + for (const std::string& input : n->input()) { if (!input.empty() && input[0] != '^') { DCHECK_EQ(found_non_control_input, false) << "RetVal node has more than one non-control input: " @@ -735,7 +735,7 @@ string Print(absl::Span<const NodeDef* const> nodes) { return out; } -absl::Status AddDefaultAttrs(const string& op, +absl::Status AddDefaultAttrs(const std::string& op, const GetFunctionSignature& get_function, AttrValueMap* attrs) { const OpDef* op_def = nullptr; @@ -799,7 +799,8 @@ absl::Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values, } } - auto substitute = [attr_values, &sig](const string& name, AttrValue* val) { + auto substitute = [attr_values, &sig](const std::string& name, + AttrValue* val) { // Look for a specified value...
if (const AttrValue* v = attr_values.FindByString(name)) { *val = *v; @@ -870,9 +871,9 @@ absl::Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values, return absl::OkStatus(); } -string DebugString(const FunctionDef& func_def) { return Print(func_def); } +std::string DebugString(const FunctionDef& func_def) { return Print(func_def); } -string DebugString(const GraphDef& instantiated_func_def) { +std::string DebugString(const GraphDef& instantiated_func_def) { std::vector<const NodeDef*> ptrs; for (const NodeDef& n : instantiated_func_def.node()) { ptrs.push_back(&n); @@ -880,7 +881,7 @@ string DebugString(const GraphDef& instantiated_func_def) { return Print(ptrs); } -string DebugString(absl::Span<const NodeDef> instantiated_func_nodes) { +std::string DebugString(absl::Span<const NodeDef> instantiated_func_nodes) { std::vector<const NodeDef*> ptrs; for (const NodeDef& n : instantiated_func_nodes) { ptrs.push_back(&n); @@ -888,8 +889,8 @@ string DebugString(absl::Span<const NodeDef> instantiated_func_nodes) { return Print(ptrs); } -string DebugStringWhole(const GraphDef& gdef) { - string ret; +std::string DebugStringWhole(const GraphDef& gdef) { + std::string ret; for (const auto& fdef : gdef.library().function()) { absl::StrAppend(&ret, Print(fdef)); } @@ -905,8 +906,8 @@ namespace { // Returns the name -> attr mapping of fdef's attrs that have a value set. In // Python, it's possible to access unset attrs, which returns a default value // and adds an unset attr to the map. -std::map<string, AttrValue> GetSetAttrs(const FunctionDef& fdef) { - std::map<string, AttrValue> set_attrs; +std::map<std::string, AttrValue> GetSetAttrs(const FunctionDef& fdef) { + std::map<std::string, AttrValue> set_attrs; for (const auto& pair : fdef.attr()) { if (pair.second.value_case() != AttrValue::VALUE_NOT_SET) { set_attrs[pair.first] = pair.second; @@ -920,8 +921,8 @@ std::map<string, AttrValue> GetSetAttrs(const FunctionDef& fdef) { bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) { if (!OpDefEqual(f1.signature(), f2.signature())) return false; - std::map<string, AttrValue> f1_attrs = GetSetAttrs(f1); - std::map<string, AttrValue> f2_attrs = GetSetAttrs(f2); + std::map<std::string, AttrValue> f1_attrs = GetSetAttrs(f1); + std::map<std::string, AttrValue> f2_attrs = GetSetAttrs(f2); if (f1_attrs.size() != f2_attrs.size()) return false; for (const auto& iter1 : f1_attrs) { auto iter2 = f2_attrs.find(iter1.first); @@ -933,25 +934,25 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) { return false; } - std::map<string, string> ret1(f1.ret().begin(), f1.ret().end()); - std::map<string, string> ret2(f2.ret().begin(), f2.ret().end()); + std::map<std::string, std::string> ret1(f1.ret().begin(), f1.ret().end()); + std::map<std::string, std::string> ret2(f2.ret().begin(), f2.ret().end()); if (ret1 != ret2) return false; - std::map<string, string> control_ret1(f1.control_ret().begin(), - f1.control_ret().end()); - std::map<string, string> control_ret2(f2.control_ret().begin(), - f2.control_ret().end()); + std::map<std::string, std::string> control_ret1(f1.control_ret().begin(), - f1.control_ret().end()); + std::map<std::string, std::string> control_ret2(f2.control_ret().begin(), + f2.control_ret().end()); if (control_ret1 != control_ret2) return false; return true; } -uint64 FunctionDefHash(const FunctionDef& fdef) { +uint64_t FunctionDefHash(const FunctionDef& fdef) { // signature - uint64 h = OpDefHash(fdef.signature()); + uint64_t h = OpDefHash(fdef.signature()); // attrs - std::map<string, AttrValue> attrs = GetSetAttrs(fdef); + std::map<std::string, AttrValue> attrs = GetSetAttrs(fdef); for (const auto& p : attrs) { h = Hash64(p.first.data(), p.first.size(), h); h = Hash64Combine(AttrValueHash(p.second), h); @@ -961,15 +962,15 @@ uint64 FunctionDefHash(const FunctionDef& fdef) { h = Hash64Combine(RepeatedNodeDefHash(fdef.node_def()), h); // output names - std::map<string, string> ret(fdef.ret().begin(), fdef.ret().end()); + std::map<std::string, std::string>
ret(fdef.ret().begin(), fdef.ret().end()); for (const auto& p : ret) { h = Hash64(p.first.data(), p.first.size(), h); h = Hash64(p.second.data(), p.second.size(), h); } // control output names - std::map<string, string> control_ret(fdef.control_ret().begin(), - fdef.control_ret().end()); + std::map<std::string, std::string> control_ret(fdef.control_ret().begin(), + fdef.control_ret().end()); for (const auto& p : control_ret) { h = Hash64(p.first.data(), p.first.size(), h); h = Hash64(p.second.data(), p.second.size(), h); @@ -981,14 +982,14 @@ uint64 FunctionDefHash(const FunctionDef& fdef) { static constexpr const char* const kExecutorAttr = "_executor"; /* static */ -string FunctionLibraryRuntime::ExecutorType(const InstantiateOptions& options, - AttrSlice attrs) { +std::string FunctionLibraryRuntime::ExecutorType( + const InstantiateOptions& options, AttrSlice attrs) { if (!options.executor_type.empty()) { return options.executor_type; } else if (const AttrValue* executor_attr = attrs.Find(kExecutorAttr)) { return executor_attr->s(); } else { - return string(); + return std::string(); } } @@ -999,7 +1000,7 @@ class AttrKeyAndValue { kRaw, kCEscape, }; - AttrKeyAndValue(absl::string_view key_name, int key_suffix, string value, + AttrKeyAndValue(absl::string_view key_name, int key_suffix, std::string value, ValueRepresentationOp value_op = kRaw) : key_name_(key_name), key_suffix_(key_suffix), @@ -1016,7 +1017,7 @@ class AttrKeyAndValue { } } - void AppendTo(bool first, string* s) const { + void AppendTo(bool first, std::string* s) const { absl::string_view v; bool add_escaped = false; if ((value_op_ == kCEscape) && NeedsEscaping(value_)) { @@ -1037,9 +1038,9 @@ class AttrKeyAndValue { } private: - static bool NeedsEscaping(const string& s) { + static bool NeedsEscaping(const std::string& s) { for (auto c : s) { - if (!isalnum(c) && (c != ' ')) { + if (!absl::ascii_isalnum(c) && (c != ' ')) { return true; } } @@ -1049,16 +1050,17 @@ class AttrKeyAndValue { absl::string_view key_name_; int key_suffix_; // -1 if missing ValueRepresentationOp value_op_; - string value_; + std::string value_; }; } // namespace -string GetFunctionResourceInputDevice( +std::string GetFunctionResourceInputDevice( const Tensor& input, const int arg_index, const FunctionDef& function_def, - absl::flat_hash_map<string, std::vector<string>>* composite_devices) { + absl::flat_hash_map<std::string, std::vector<std::string>>* + composite_devices) { const auto& handles = input.flat<ResourceHandle>(); const ResourceHandle& handle0 = handles(0); - string composite_device; + std::string composite_device; auto iter = function_def.arg_attr().find(arg_index); if (iter != function_def.arg_attr().end()) { auto arg_attr = iter->second.attr().find("_composite_device"); @@ -1078,8 +1080,9 @@ string GetFunctionResourceInputDevice( } } -string Canonicalize(const string& funcname, AttrSlice attrs, - const FunctionLibraryRuntime::InstantiateOptions& options) { +std::string Canonicalize( + const std::string& funcname, AttrSlice attrs, + const FunctionLibraryRuntime::InstantiateOptions& options) { absl::InlinedVector<AttrKeyAndValue, 8> entries; entries.reserve(attrs.size() + static_cast<int>(!options.target.empty()) + options.input_devices.size()); @@ -1118,12 +1121,13 @@ string Canonicalize(const string& funcname, AttrSlice attrs, entries.push_back( AttrKeyAndValue("_state_handle", -1, options.state_handle)); } - string executor_type = FunctionLibraryRuntime::ExecutorType(options, attrs); + std::string executor_type = + FunctionLibraryRuntime::ExecutorType(options, attrs); if (!executor_type.empty()) { entries.push_back(AttrKeyAndValue(kExecutorAttr, -1, executor_type)); } if
(options.config_proto.ByteSize() > 0) { - string config_proto_serialized; + std::string config_proto_serialized; SerializeToStringDeterministic(options.config_proto, &config_proto_serialized); entries.push_back(AttrKeyAndValue("_config_proto", -1, @@ -1131,7 +1135,7 @@ string Canonicalize(const string& funcname, AttrSlice attrs, AttrKeyAndValue::kCEscape)); } std::sort(entries.begin(), entries.end()); - string result = absl::StrCat(funcname, "["); + std::string result = absl::StrCat(funcname, "["); bool first = true; for (const auto& entry : entries) { entry.AppendTo(first, &result); @@ -1141,7 +1145,7 @@ string Canonicalize(const string& funcname, AttrSlice attrs, return result; } -string Canonicalize(const string& funcname, AttrSlice attrs) { +std::string Canonicalize(const std::string& funcname, AttrSlice attrs) { static const FunctionLibraryRuntime::InstantiateOptions* kEmptyOptions = new FunctionLibraryRuntime::InstantiateOptions; return Canonicalize(funcname, attrs, *kEmptyOptions); @@ -1373,12 +1377,13 @@ void FunctionLibraryDefinition::Initialize( } } -bool FunctionLibraryDefinition::Contains(const string& func) const { +bool FunctionLibraryDefinition::Contains(const std::string& func) const { tf_shared_lock l(mu_); return records_.find(func) != records_.end(); } -const FunctionDef* FunctionLibraryDefinition::Find(const string& func) const { +const FunctionDef* FunctionLibraryDefinition::Find( + const std::string& func) const { tf_shared_lock l(mu_); auto result = FindHelper(func); if (result) { @@ -1389,13 +1394,13 @@ const FunctionDef* FunctionLibraryDefinition::Find(const string& func) const { } core::RefCountPtr<FunctionRecord> FunctionLibraryDefinition::FindRecord( - const string& func) const { + const std::string& func) const { tf_shared_lock l(mu_); return FindHelper(func); } core::RefCountPtr<FunctionRecord> FunctionLibraryDefinition::FindHelper( - const string& func) const { + const std::string& func) const { auto iter = records_.find(func); if (iter == records_.end()) { return nullptr; @@ -1474,7 +1479,7 @@ absl::Status FunctionLibraryDefinition::AddHelper(FunctionRecord* registration, } absl::Status FunctionLibraryDefinition::CopyFunctionDefFrom( - const string& name, const FunctionLibraryDefinition& other) { + const std::string& name, const FunctionLibraryDefinition& other) { if (default_registry() != other.default_registry()) { return errors::InvalidArgument( "Cannot copy function '", name, @@ -1516,7 +1521,7 @@ absl::Status FunctionLibraryDefinition::AddGradientDef( absl::Status FunctionLibraryDefinition::AddGradientDefHelper( const GradientDef& grad, bool* added) { *added = false; - string* entry = &func_grad_[grad.function_name()]; + std::string* entry = &func_grad_[grad.function_name()]; if (!entry->empty()) { if (*entry != grad.gradient_func()) { return errors::InvalidArgument( @@ -1545,8 +1550,8 @@ absl::Status FunctionLibraryDefinition::AddLibrary( mutex_lock l2(other.mu_); // Remember the funcs and grads that we added successfully so that // we can roll them back on error. - std::vector<string> funcs; - std::vector<string> funcs_with_grads; + std::vector<std::string> funcs; + std::vector<std::string> funcs_with_grads; absl::Status s; bool added; for (const auto& [name, record] : other.records_) { @@ -1603,8 +1608,8 @@ absl::Status FunctionLibraryDefinition::AddLibrary( // Remember the funcs and grads that we added successfully so that // we can roll them back on error.
mutex_lock l(mu_); - std::vector<string> funcs; - std::vector<string> funcs_with_grads; + std::vector<std::string> funcs; + std::vector<std::string> funcs_with_grads; absl::Status s; bool added; for (FunctionDef& fdef : *lib_def.mutable_function()) { @@ -1641,7 +1646,7 @@ absl::Status FunctionLibraryDefinition::AddLibrary( } absl::Status FunctionLibraryDefinition::ReplaceFunction( - const string& func, const FunctionDef& fdef, + const std::string& func, const FunctionDef& fdef, const StackTracesMap& stack_traces) { mutex_lock l(mu_); bool added; @@ -1660,14 +1665,15 @@ absl::Status FunctionLibraryDefinition::ReplaceGradient( return absl::OkStatus(); } -absl::Status FunctionLibraryDefinition::RemoveFunction(const string& func) { +absl::Status FunctionLibraryDefinition::RemoveFunction( + const std::string& func) { mutex_lock l(mu_); TF_RETURN_IF_ERROR(RemoveFunctionHelper(func)); return absl::OkStatus(); } absl::Status FunctionLibraryDefinition::RemoveFunctionHelper( - const string& func) { + const std::string& func) { auto iter = records_.find(func); if (iter == records_.end()) { return errors::InvalidArgument("Tried to remove non-existent function '", @@ -1688,7 +1694,8 @@ void FunctionLibraryDefinition::Clear() { func_grad_.clear(); } -absl::Status FunctionLibraryDefinition::RemoveGradient(const string& func) { +absl::Status FunctionLibraryDefinition::RemoveGradient( + const std::string& func) { const auto& i = func_grad_.find(func); if (i == func_grad_.end()) { return errors::InvalidArgument("Tried to remove non-existent gradient '", @@ -1699,16 +1706,16 @@ absl::Status FunctionLibraryDefinition::RemoveGradient(const string& func) { } absl::Status FunctionLibraryDefinition::Remove( - const std::vector<string>& funcs, - const std::vector<string>& funcs_with_grads) { + const std::vector<std::string>& funcs, + const std::vector<std::string>& funcs_with_grads) { absl::Status s; - for (const string& f : funcs) { + for (const std::string& f : funcs) { s = RemoveFunctionHelper(f); if (!s.ok()) { return s; } } - for (const string& f : funcs_with_grads) { + for (const std::string& f : funcs_with_grads) { s = RemoveGradient(f); if (!s.ok()) { return s; @@ -1717,17 +1724,19 @@ absl::Status FunctionLibraryDefinition::Remove( return absl::OkStatus(); } -string FunctionLibraryDefinition::FindGradient(const string& func) const { +std::string FunctionLibraryDefinition::FindGradient( + const std::string& func) const { tf_shared_lock l(mu_); return gtl::FindWithDefault(func_grad_, func, ""); } -string FunctionLibraryDefinition::FindGradientHelper(const string& func) const { +std::string FunctionLibraryDefinition::FindGradientHelper( + const std::string& func) const { return gtl::FindWithDefault(func_grad_, func, ""); } absl::Status FunctionLibraryDefinition::LookUp( - const string& op, const OpRegistrationData** op_reg_data) const { + const std::string& op, const OpRegistrationData** op_reg_data) const { tf_shared_lock l(mu_); auto iter = records_.find(op); if (iter != records_.end()) { @@ -1737,11 +1746,11 @@ absl::Status FunctionLibraryDefinition::LookUp( return default_registry_->LookUp(op, op_reg_data); } -string FunctionLibraryDefinition::UniqueFunctionName( +std::string FunctionLibraryDefinition::UniqueFunctionName( absl::string_view prefix) const { tf_shared_lock l(mu_); int index = 0; - string name = absl::StrCat(prefix, index); + std::string name = absl::StrCat(prefix, index); while (records_.find(name) != records_.end()) { ++index; name = absl::StrCat(prefix, index); @@ -1763,8 +1772,8 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl( if (!TryGetNodeAttr(ndef,
kFuncAttr, &forward_func_attrs)) { return nullptr; } - const string& func_name = forward_func_attrs->name(); - const string& grad_name = FindGradient(func_name); + const std::string& func_name = forward_func_attrs->name(); + const std::string& grad_name = FindGradient(func_name); // If 'func' has a user-defined gradient function, uses the grad // function's attrs to see if noinline is specified. Otherwise, // uses func's attrs. @@ -1782,8 +1791,8 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl( } } -std::vector<string> FunctionLibraryDefinition::ListFunctionNames() const { - std::vector<string> function_names; +std::vector<std::string> FunctionLibraryDefinition::ListFunctionNames() const { + std::vector<std::string> function_names; tf_shared_lock l(mu_); function_names.reserve(records_.size()); for (const auto& it : records_) { @@ -1808,7 +1817,7 @@ FunctionDefLibrary FunctionLibraryDefinition::ToProto() const { template <typename T> absl::Status FunctionLibraryDefinition::GetAttr(const NodeDef& ndef, - const string& attr, + const std::string& attr, T* value) const { const FunctionDef* fdef = GetAttrImpl(ndef); if (fdef && TryGetNodeAttr(AttrSlice(&fdef->attr()), attr, value)) { @@ -1819,7 +1828,7 @@ absl::Status FunctionLibraryDefinition::GetAttr(const NodeDef& ndef, template <typename T> absl::Status FunctionLibraryDefinition::GetAttr(const Node& node, - const string& attr, + const std::string& attr, T* value) const { return GetAttr(node.def(), attr, value); } @@ -1839,25 +1848,25 @@ constexpr char kApiImplements[] = "api_implements"; template <typename NodeIter, typename OpTypeGetter, typename AttrGetter> -std::set<string> ReachableFunctions(const FunctionLibraryDefinition& flib, - NodeIter begin, NodeIter end, - OpTypeGetter op_type_getter, - AttrGetter attr_getter) { +std::set<std::string> ReachableFunctions(const FunctionLibraryDefinition& flib, + NodeIter begin, NodeIter end, + OpTypeGetter op_type_getter, + AttrGetter attr_getter) { // Functions that are reachable from the graph. - std::set<string> reachable_funcs; + std::set<std::string> reachable_funcs; // For any functions, if it has attribute "api_implements" = // "some_interface" and it is reachable, then it means any other // function with same attribute name and value could also be potentially // reachable, eg via implementation_selector swapping the nodedef. - absl::flat_hash_set<string> reachable_api_interface; + absl::flat_hash_set<std::string> reachable_api_interface; // Functions might be reachable from the nested function calls, so we keep a // queue of functions that we have to check. absl::InlinedVector<core::RefCountPtr<FunctionRecord>, 4> func_queue; // Add reachable and not already processed functions to the functions queue. - const auto add_to_func_queue = [&](const string& func_name) { + const auto add_to_func_queue = [&](const std::string& func_name) { auto record = flib.FindRecord(func_name); if (record && reachable_funcs.find(func_name) == reachable_funcs.end()) { func_queue.push_back(std::move(record)); @@ -1866,19 +1875,20 @@ std::set<string> ReachableFunctions(const FunctionLibraryDefinition& flib, // If any function with certain API name is reachable, all the other functions // with same API name should also be checked.
- const auto add_function_with_api_interface = [&](const string& api_name) { - if (!reachable_api_interface.contains(api_name)) { - reachable_api_interface.insert(api_name); - for (const auto& func_name : flib.ListFunctionNames()) { - const auto record = flib.FindRecord(func_name); - const auto attr_it = record->fdef().attr().find(kApiImplements); - if (attr_it != record->fdef().attr().end() && - attr_it->second.s() == api_name) { - add_to_func_queue(func_name); + const auto add_function_with_api_interface = + [&](const std::string& api_name) { + if (!reachable_api_interface.contains(api_name)) { + reachable_api_interface.insert(api_name); + for (const auto& func_name : flib.ListFunctionNames()) { + const auto record = flib.FindRecord(func_name); + const auto attr_it = record->fdef().attr().find(kApiImplements); + if (attr_it != record->fdef().attr().end() && + attr_it->second.s() == api_name) { + add_to_func_queue(func_name); + } + } } - } - } - }; + }; const auto process_attr_value = [&](const AttrValue& attr_value) { // 1. AttrValue.func @@ -1913,7 +1923,7 @@ std::set<string> ReachableFunctions(const FunctionLibraryDefinition& flib, auto func = std::move(func_queue.back()); func_queue.pop_back(); - const string& func_name = func->fdef().signature().name(); + const std::string& func_name = func->fdef().signature().name(); reachable_funcs.insert(func_name); const auto attr_it = func->fdef().attr().find(kApiImplements); @@ -1937,7 +1947,7 @@ std::set<string> ReachableFunctions(const FunctionLibraryDefinition& flib, std::for_each(func_body.begin(), func_body.end(), process_node_def); // Check if the function has a registered gradient. - const string grad_func_name = flib.FindGradient(func_name); + const std::string grad_func_name = flib.FindGradient(func_name); if (!grad_func_name.empty()) add_to_func_queue(grad_func_name); } @@ -1949,19 +1959,19 @@ template <typename NodeIter, typename OpTypeGetter, typename AttrGetter> - std::set<string> reachable_funcs = ReachableFunctions( + std::set<std::string> reachable_funcs = ReachableFunctions( flib, begin, end, op_type_getter, attr_getter); FunctionLibraryDefinition reachable_flib(flib.default_registry(), FunctionDefLibrary()); - for (const string& func_name : reachable_funcs) { + for (const std::string& func_name : reachable_funcs) { // This should never fail, because we copy functions from a valid flib and // use the same default registry. absl::Status added = reachable_flib.CopyFunctionDefFrom(func_name, flib); TF_DCHECK_OK(added); - const string grad_func_name = flib.FindGradient(func_name); + const std::string grad_func_name = flib.FindGradient(func_name); if (!grad_func_name.empty()) { GradientDef grad; grad.set_function_name(func_name); @@ -1975,9 +1985,9 @@ FunctionLibraryDefinition ReachableFunctionLibraryDefinition( return reachable_flib; } -string AllocatorAttributesToString( +std::string AllocatorAttributesToString( const std::vector<AllocatorAttributes>& attrs) { - string result("["); + std::string result("["); // AllocatorAttribute::DebugString produces around 85 bytes now.
result.reserve(100 * attrs.size()); for (const AllocatorAttributes& attr : attrs) { @@ -2036,7 +2046,7 @@ FunctionLibraryDefinition::ReachableDefinitions( } } -string FunctionLibraryRuntime::Options::DebugString() const { +std::string FunctionLibraryRuntime::Options::DebugString() const { return absl::StrCat( "FLR::Options(step_id=", step_id, " rendezvous=", IsSet(rendezvous), " cancellation_manager=", IsSet(cancellation_manager), @@ -2060,8 +2070,8 @@ void FunctionDefHelper::AttrValueWrapper::InitFromString( } FunctionDefHelper::AttrValueWrapper FunctionDefHelper::FunctionRef( - const string& name, - absl::Span<const std::pair<string, AttrValueWrapper>> attrs) { + const std::string& name, + absl::Span<const std::pair<std::string, AttrValueWrapper>> attrs) { AttrValueWrapper ret; ret.proto.mutable_func()->set_name(name); for (const auto& a : attrs) { @@ -2077,10 +2087,10 @@ NodeDef FunctionDefHelper::Node::ToNodeDef() const { for (const auto& a : this->attr) { n.mutable_attr()->insert({a.first, a.second.proto}); } - for (const string& a : this->arg) { + for (const std::string& a : this->arg) { n.add_input(a); } - for (const string& d : this->dep) { + for (const std::string& d : this->dep) { n.add_input(absl::StrCat("^", d)); } if (!this->device.empty()) { @@ -2099,11 +2109,11 @@ NodeDef FunctionDefHelper::Node::ToNodeDef() const { /* static */ FunctionDef FunctionDefHelper::Create( - const string& function_name, absl::Span<const string> in_def, - absl::Span<const string> out_def, absl::Span<const string> attr_def, - absl::Span<const Node> node_def, - absl::Span<const std::pair<string, string>> ret_def, - absl::Span<const std::pair<string, string>> control_ret_def) { + const std::string& function_name, absl::Span<const std::string> in_def, + absl::Span<const std::string> out_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def, + absl::Span<const std::pair<std::string, std::string>> ret_def, + absl::Span<const std::pair<std::string, std::string>> control_ret_def) { FunctionDef fdef; // Signature @@ -2149,19 +2159,19 @@ FunctionDef FunctionDefHelper::Create( /* static */ FunctionDef FunctionDefHelper::Create( - const string& function_name, absl::Span<const string> in_def, - absl::Span<const string> out_def, absl::Span<const string> attr_def, - absl::Span<const Node> node_def, - absl::Span<const std::pair<string, string>> ret_def) { + const std::string& function_name, absl::Span<const std::string> in_def, + absl::Span<const std::string> out_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def, + absl::Span<const std::pair<std::string, std::string>> ret_def) { return Create(function_name, in_def, out_def, attr_def, node_def, ret_def, /*control_ret_def=*/{}); } /* static */ -FunctionDef FunctionDefHelper::Define(const string& name, - absl::Span<const string> arg_def, - absl::Span<const string> ret_def, - absl::Span<const string> attr_def, +FunctionDef FunctionDefHelper::Define(const std::string& name, + absl::Span<const std::string> arg_def, + absl::Span<const std::string> ret_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def) { FunctionDef fdef; OpDefBuilder b(name); @@ -2174,7 +2184,7 @@ FunctionDef FunctionDefHelper::Define(const string& name, fdef.mutable_signature()->Swap(&op_reg_data.op_def); // Mapping from legacy output names to NodeDef outputs.
- std::unordered_map<string, string> ret_index; + std::unordered_map<std::string, std::string> ret_index; for (const auto& a : fdef.signature().input_arg()) { ret_index[a.name()] = a.name(); } @@ -2190,13 +2200,13 @@ FunctionDef FunctionDefHelper::Define(const string& name, for (const auto& a : src.attr) { n->mutable_attr()->insert({a.first, a.second.proto}); } - for (const string& a : src.arg) { + for (const std::string& a : src.arg) { const auto iter = ret_index.find(a); CHECK(iter != ret_index.end()) << "Node input '" << a << "' in '" << n->name() << "' of " << name; n->add_input(iter->second); } - for (const string& d : src.dep) { + for (const std::string& d : src.dep) { n->add_input(absl::StrCat("^", d)); } @@ -2227,29 +2237,29 @@ FunctionDef FunctionDefHelper::Define(const string& name, return fdef; } -FunctionDef FunctionDefHelper::Define(absl::Span<const string> arg_def, - absl::Span<const string> ret_def, - absl::Span<const string> attr_def, +FunctionDef FunctionDefHelper::Define(absl::Span<const std::string> arg_def, + absl::Span<const std::string> ret_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def) { return Define("_", arg_def, ret_def, attr_def, node_def); } namespace gradient { -typedef std::unordered_map<string, Creator> OpGradFactory; +typedef std::unordered_map<std::string, Creator> OpGradFactory; OpGradFactory* GetOpGradFactory() { static OpGradFactory* factory = new OpGradFactory; return factory; } -bool RegisterOp(const string& op, Creator func) { +bool RegisterOp(const std::string& op, Creator func) { CHECK(GetOpGradFactory()->insert({op, func}).second) << "Duplicated gradient for " << op; return true; } -absl::Status GetOpGradientCreator(const string& op, Creator* creator) { +absl::Status GetOpGradientCreator(const std::string& op, Creator* creator) { auto fac = GetOpGradFactory(); auto iter = fac->find(op); if (iter == fac->end()) { diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 7fbf120afd6741..ed2ec8c075db08 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -125,7 +125,7 @@ class FunctionDefHelper { // Constructs an AttrValue.func given the "name" and "attrs". static AttrValueWrapper FunctionRef( const std::string& name, - absl::Span<const std::pair<string, AttrValueWrapper>> attrs); + absl::Span<const std::pair<std::string, AttrValueWrapper>> attrs); static AttrValueWrapper FunctionRef(const std::string& name) { return FunctionRef(name, {}); } @@ -141,11 +141,11 @@ class FunctionDefHelper { struct Node { // When constructing a NodeDef, the first entry in ret is used as // the node name, the remaining values are ignored. - std::vector<string> ret; + std::vector<std::string> ret; std::string op; - std::vector<string> arg; - std::vector<std::pair<string, AttrValueWrapper>> attr; - std::vector<string> dep; + std::vector<std::string> arg; + std::vector<std::pair<std::string, AttrValueWrapper>> attr; + std::vector<std::string> dep; std::string device; // Required if the op has zero outputs. Otherwise, ret[0] used as name if @@ -157,8 +157,8 @@ class FunctionDefHelper { CHECK(!ret.empty()); return ret[0]; } - std::vector<string> original_node_names; - std::vector<string> original_func_names; + std::vector<std::string> original_node_names; + std::vector<std::string> original_func_names; NodeDef ToNodeDef() const; }; @@ -170,33 +170,33 @@ class FunctionDefHelper { // - `control_ret_def` holds a mapping from the function control // output names to the nodes from `node_def`. static FunctionDef Create( - const std::string& function_name, absl::Span<const string> in_def, - absl::Span<const string> out_def, absl::Span<const string> attr_def, - absl::Span<const Node> node_def, - absl::Span<const std::pair<string, string>> ret_def, - absl::Span<const std::pair<string, string>> control_ret_def); + const std::string& function_name, absl::Span<const std::string> in_def, + absl::Span<const std::string> out_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def, + absl::Span<const std::pair<std::string, std::string>> ret_def, + absl::Span<const std::pair<std::string, std::string>> control_ret_def); // Creates a FunctionDef from the given parameters.
Node inputs must use // function encoding (node_name:output_name[:output_index]). // - `ret_def` holds a mapping from the function output names from `out_def` // to the node outputs from `node_def`. static FunctionDef Create( - const std::string& function_name, absl::Span<const string> in_def, - absl::Span<const string> out_def, absl::Span<const string> attr_def, - absl::Span<const Node> node_def, - absl::Span<const std::pair<string, string>> ret_def); + const std::string& function_name, absl::Span<const std::string> in_def, + absl::Span<const std::string> out_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def, + absl::Span<const std::pair<std::string, std::string>> ret_def); // TODO(josh11b): Get rid of these and transition to the one above. static FunctionDef Define(const std::string& function_name, - absl::Span<const string> arg_def, - absl::Span<const string> ret_def, - absl::Span<const string> attr_def, + absl::Span<const std::string> arg_def, + absl::Span<const std::string> ret_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def); // Defines an anonymous function. I.e., its name is not relevant. - static FunctionDef Define(absl::Span<const string> arg_def, - absl::Span<const string> ret_def, - absl::Span<const string> attr_def, + static FunctionDef Define(absl::Span<const std::string> arg_def, + absl::Span<const std::string> ret_def, + absl::Span<const std::string> attr_def, absl::Span<const Node> node_def); // Helpers to construct a constant scalar. @@ -258,7 +258,7 @@ inline FunctionDefHelper::AttrValueWrapper::AttrValueWrapper( // GetFunctionSignature(func name, opdef) returns OK if the func name is found // and opdef is filled with a pointer to the corresponding signature // (a OpDef proto). Otherwise, returns an error. -typedef std::function<absl::Status(const string&, const OpDef**)> +typedef std::function<absl::Status(const std::string&, const OpDef**)> GetFunctionSignature; struct InstantiationResult { @@ -293,7 +293,7 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2); // Return a hash of `fdef` that is consistent with FunctionDefsEqual method. // In other words, if two fdefs compare equal, their hash values will be the // same. -uint64 FunctionDefHash(const FunctionDef& fdef); +uint64_t FunctionDefHash(const FunctionDef& fdef); class CallFrameInterface { public: @@ -566,7 +566,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface { } // Returns all the function names in the FunctionLibraryDefinition. - std::vector<string> ListFunctionNames() const TF_LOCKS_EXCLUDED(mu_); + std::vector<std::string> ListFunctionNames() const TF_LOCKS_EXCLUDED(mu_); const OpRegistryInterface* default_registry() const { return default_registry_; @@ -658,7 +658,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface { void Initialize(const FunctionDefLibrary& library, const FunctionDefLibraryStackTraces& library_traces); - core::RefCountPtr<FunctionRecord> FindHelper(const string& func) const + core::RefCountPtr<FunctionRecord> FindHelper(const std::string& func) const TF_SHARED_LOCKS_REQUIRED(mu_); std::string FindGradientHelper(const std::string& func) const TF_SHARED_LOCKS_REQUIRED(mu_); @@ -681,8 +681,8 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // Remove all functions in `funcs` and all gradients of functions in // `funcs_with_grads` from this library. - absl::Status Remove(const std::vector<string>& funcs, - const std::vector<string>& funcs_with_grads) + absl::Status Remove(const std::vector<std::string>& funcs, + const std::vector<std::string>& funcs_with_grads) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Remove `func` from the library. Returns non-OK Status unless `func` is in @@ -698,10 +698,11 @@ class FunctionLibraryDefinition : public OpRegistryInterface { mutable mutex mu_; const OpRegistryInterface* default_registry_; - gtl::FlatMap<string, FunctionRecord*> records_ TF_GUARDED_BY(mu_); - gtl::FlatMap<string, string> func_grad_ TF_GUARDED_BY(mu_); + gtl::FlatMap<std::string, FunctionRecord*> records_ TF_GUARDED_BY(mu_); + gtl::FlatMap<std::string, std::string> func_grad_ TF_GUARDED_BY(mu_); // Maps from function name to optimized function graph.
- gtl::FlatMap<string, std::function<absl::StatusOr<OptimizedFunctionGraph>()>> + gtl::FlatMap<std::string, + std::function<absl::StatusOr<OptimizedFunctionGraph>()>> optimized_function_graph_creator_map_ TF_GUARDED_BY(mu_); }; @@ -752,7 +753,7 @@ class FunctionLibraryRuntime : public core::WeakRefCounted { // function's inputs. The device of resource inputs must be the device // backing the resource, not the CPU device backing the resource handle. // Must have the same length as number of inputs to the function. - std::vector<string> input_devices; + std::vector<std::string> input_devices; // For multi-device functions, a vector of canonical device names for // function's outputs. @@ -780,14 +781,15 @@ class FunctionLibraryRuntime : public core::WeakRefCounted { // resource output, and node producing that resource is a function call, // runtime will leave device specification empty and will rely on Placer to // infer correct device. - std::vector<string> output_devices; + std::vector<std::string> output_devices; // If set, it indicates the original output indices of a component function. absl::optional<std::vector<int>> ret_indices = absl::nullopt; // Maps from a CompositeDevice name to a list of underlying physical // devices. - absl::flat_hash_map<string, const std::vector<string>*> composite_devices; + absl::flat_hash_map<std::string, const std::vector<std::string>*> + composite_devices; // This interface is EXPERIMENTAL and subject to change. // @@ -836,8 +838,8 @@ class FunctionLibraryRuntime : public core::WeakRefCounted { // If provided, this optimization function will be invoked before // the placer for multi-device functions. - std::function<absl::Status(std::vector<string> /*ret_node_names*/, - std::vector<string> /*keep_node_names*/, + std::function<absl::Status(std::vector<std::string> /*ret_node_names*/, + std::vector<std::string> /*keep_node_names*/, FunctionLibraryDefinition*, const DeviceSet&, Device* /*cpu_device*/, std::unique_ptr<Graph>*)> optimize_graph_fn; @@ -899,7 +901,7 @@ class FunctionLibraryRuntime : public core::WeakRefCounted { // Instantiates the function enabling soft placement or outside compilation. bool allow_soft_placement = false; }; - typedef uint64 Handle; + typedef uint64_t Handle; virtual absl::Status Instantiate(const std::string& function_name, AttrSlice attrs, const InstantiateOptions& options, @@ -1055,7 +1057,7 @@ class FunctionLibraryRuntime : public core::WeakRefCounted { // Returns the graph version number. virtual int graph_def_version() const = 0; - typedef uint64 LocalHandle; + typedef uint64_t LocalHandle; // Creates a copy of ProcessFunctionLibraryRuntime (transferring ownership to // the caller), FunctionLibraryRuntime (owned by the returned @@ -1088,7 +1090,8 @@ class FunctionLibraryRuntime : public core::WeakRefCounted { // `composite_devices` if the input device is a composite device. std::string GetFunctionResourceInputDevice( const Tensor& input, const int arg_index, const FunctionDef& function_def, - absl::flat_hash_map<string, std::vector<string>>* composite_devices); + absl::flat_hash_map<std::string, std::vector<std::string>>* + composite_devices); // Returns a canonicalized string for the instantiation of the function of the // given "name", attributes "attrs", and "options". @@ -1173,7 +1176,7 @@ class DistributedFunctionLibraryRuntime { FunctionLibraryRuntime::DoneCallback done) = 0; // Clean up a previously instantiated function on remote worker.
- virtual void CleanUp(uint64 step_id, + virtual void CleanUp(uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle, FunctionLibraryRuntime::DoneCallback done) = 0; diff --git a/tensorflow/core/framework/function_handle_cache.cc b/tensorflow/core/framework/function_handle_cache.cc index 6b9119b681af88..d0d995cbcc3712 100644 --- a/tensorflow/core/framework/function_handle_cache.cc +++ b/tensorflow/core/framework/function_handle_cache.cc @@ -33,10 +33,10 @@ FunctionHandleCache::~FunctionHandleCache() { } absl::Status FunctionHandleCache::Instantiate( - const string& function_name, AttrSlice attrs, + const std::string& function_name, AttrSlice attrs, FunctionLibraryRuntime::InstantiateOptions options, FunctionLibraryRuntime::Handle* handle) { - string key = Canonicalize(function_name, attrs, options); + std::string key = Canonicalize(function_name, attrs, options); FunctionLibraryRuntime::Handle h; { tf_shared_lock l(mu_); diff --git a/tensorflow/core/framework/function_handle_cache.h b/tensorflow/core/framework/function_handle_cache.h index 1bd67138d1964f..317c53823c1685 100644 --- a/tensorflow/core/framework/function_handle_cache.h +++ b/tensorflow/core/framework/function_handle_cache.h @@ -34,7 +34,7 @@ class FunctionHandleCache { // // The cache retains the ownership of the handle. In particular, the caller // should not invoke `ReleaseHandle`. - absl::Status Instantiate(const string& function_name, AttrSlice attrs, + absl::Status Instantiate(const std::string& function_name, AttrSlice attrs, FunctionLibraryRuntime::InstantiateOptions options, FunctionLibraryRuntime::Handle* handle); @@ -45,8 +45,8 @@ class FunctionHandleCache { private: mutex mu_; FunctionLibraryRuntime* lib_ = nullptr; // not owned - const string state_handle_; - std::unordered_map<string, FunctionLibraryRuntime::Handle> handles_ + const std::string state_handle_; + std::unordered_map<std::string, FunctionLibraryRuntime::Handle> handles_ TF_GUARDED_BY(mu_); }; diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc index 1a396876e00166..fcae39d0277bab 100644 --- a/tensorflow/core/framework/function_test.cc +++ b/tensorflow/core/framework/function_test.cc @@ -54,7 +54,7 @@ using ::testing::UnorderedElementsAreArray; class Attrs { public: Attrs(const std::initializer_list< // NOLINT(runtime/explicit) - std::pair<string, FunctionDefHelper::AttrValueWrapper>> + std::pair<std::string, FunctionDefHelper::AttrValueWrapper>> attrs) { for (const auto& aval : attrs) { map_.insert({aval.first, aval.second.proto}); } @@ -69,7 +69,7 @@ class Attrs { typedef FunctionDefHelper FDH; -absl::Status GetOpSig(const string& op, const OpDef** sig) { +absl::Status GetOpSig(const std::string& op, const OpDef** sig) { return OpRegistry::Global()->LookUpOpDef(op, sig); } @@ -490,7 +490,7 @@ WXPlusB[T:{float, double}](w:T, x:T, b:T) -> (y:T) { } TEST(TFunc, Body_TypeList) { - const Tensor kZero = test::AsScalar<int32>(0); + const Tensor kZero = test::AsScalar<int32_t>(0); auto fdef = FDH::Create( // Name "Test", @@ -633,7 +633,7 @@ TEST(TFunc, IntsOnDeviceArgSet) { EXPECT_EQ("_DeviceRetval", result.nodes[4].op()); } -static void HasError(const absl::Status& s, const string& substr) { +static void HasError(const absl::Status& s, const std::string& substr) { EXPECT_TRUE(absl::StrContains(s.ToString(), substr)) << ">>" << s << "<<, expected substring >>" << substr << "<<"; } @@ -1229,7 +1229,7 @@ TEST(FunctionLibraryDefinitionTest, AddLibrary) { TF_EXPECT_OK(lib_def.AddLibrary(lib_def)); } -GradientDef MakeGradDef(const string& f, const string& g) { +GradientDef MakeGradDef(const std::string& f, const std::string& g) { GradientDef grad; grad.set_function_name(f); grad.set_gradient_func(g); @@
-1239,8 +1239,8 @@ GradientDef MakeGradDef(const string& f, const string& g) { TEST(FunctionLibraryDefinitionTest, AddLibrary_Atomic) { // Create lib def containing two functions with equal names FunctionDefLibrary proto; - const string x2_name = test::function::XTimesTwo().signature().name(); - const string x4_name = test::function::XTimesFour().signature().name(); + const std::string x2_name = test::function::XTimesTwo().signature().name(); + const std::string x4_name = test::function::XTimesFour().signature().name(); *proto.add_function() = test::function::XTimesTwo(); FunctionDef fdef = test::function::XTimesFour(); fdef.mutable_signature()->set_name(x2_name); @@ -1275,9 +1275,9 @@ TEST(FunctionLibraryDefinitionTest, AddLibrary_Atomic) { } TEST(FunctionLibraryDefinitionTest, AddLibraryDefinition_Atomic_FuncConflict) { - const string x2_name = test::function::XTimesTwo().signature().name(); - const string x4_name = test::function::XTimesFour().signature().name(); - const string wx_name = test::function::WXPlusB().signature().name(); + const std::string x2_name = test::function::XTimesTwo().signature().name(); + const std::string x4_name = test::function::XTimesFour().signature().name(); + const std::string wx_name = test::function::WXPlusB().signature().name(); // Create FunctionLibraryDefinition with // (func = XTimesTwo, grad = XTimesFour) @@ -1311,9 +1311,9 @@ TEST(FunctionLibraryDefinitionTest, AddLibraryDefinition_Atomic_FuncConflict) { } TEST(FunctionLibraryDefinitionTest, AddLibraryDefinition_Atomic_GradConflict) { - const string x2_name = test::function::XTimesTwo().signature().name(); - const string x4_name = test::function::XTimesFour().signature().name(); - const string wx_name = test::function::WXPlusB().signature().name(); + const std::string x2_name = test::function::XTimesTwo().signature().name(); + const std::string x4_name = test::function::XTimesFour().signature().name(); + const std::string wx_name = test::function::WXPlusB().signature().name(); // Create FunctionLibraryDefinition with // (func = XTimesTwo, grad = XTimesFour) @@ -1372,8 +1372,8 @@ TEST(FunctionLibraryDefinitionTest, ListFunctionNames) { TF_CHECK_OK(lib_def.AddFunctionDef(test::function::XTimesTwo())); TF_CHECK_OK(lib_def.AddFunctionDef(test::function::WXPlusB())); - const std::vector<string> function_names = lib_def.ListFunctionNames(); - const std::vector<string> expected = {"XTimesTwo", "WXPlusB"}; + const std::vector<std::string> function_names = lib_def.ListFunctionNames(); + const std::vector<std::string> expected = {"XTimesTwo", "WXPlusB"}; EXPECT_EQ(function_names, expected); } @@ -1399,7 +1399,7 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_FuncNoAttr) { } template <typename T> -void SetAttrValue(FunctionDef* fdef, const string& attr, const T& value) { +void SetAttrValue(FunctionDef* fdef, const std::string& attr, const T& value) { AttrValue attr_value; SetAttrValue(value, &attr_value); fdef->mutable_attr()->insert({attr, attr_value}); @@ -1421,7 +1421,7 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_FuncWithAttr) { TF_EXPECT_OK(lib.GetAttr(ndef, "annotation", &annotation)); EXPECT_EQ(annotation, true); - string str; + std::string str; TF_EXPECT_OK(lib.GetAttr(ndef, "options", &str)); EXPECT_EQ(str, "some string data"); } @@ -1462,8 +1462,8 @@ TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) { using ::tensorflow::test::function::NDef; using FDH = ::tensorflow::FunctionDefHelper; - const auto make_simple_fdef = [](const string& name, - const string& interface_name) { + const auto make_simple_fdef = [](const std::string& name, + const
std::string& interface_name) { auto func_def = FDH::Create( name, {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"}, {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}}, @@ -1616,7 +1616,7 @@ TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) { // Equal functions const FunctionDef fdef1 = test::function::XTimesTwo(); FunctionDef fdef2 = test::function::XTimesTwo(); - uint64 hash1 = FunctionDefHash(fdef1); + uint64_t hash1 = FunctionDefHash(fdef1); EXPECT_TRUE(FunctionDefsEqual(fdef1, fdef2)); EXPECT_EQ(hash1, FunctionDefHash(fdef2)); @@ -1760,7 +1760,7 @@ TEST(InstantiateFunctionTest, ResourceInputDevice) { *(*arg_attrs.mutable_attr())["_composite_device"].mutable_s() = "/device:COMPOSITE:0"; (*fdef.mutable_arg_attr())[0] = arg_attrs; - absl::flat_hash_map<string, std::vector<string>> composite_devices; + absl::flat_hash_map<std::string, std::vector<std::string>> composite_devices; Tensor arg0(DT_RESOURCE, TensorShape({2})); ResourceHandle resource_handle0; @@ -1773,9 +1773,9 @@ Tensor arg1(DT_RESOURCE, TensorShape({})); arg1.scalar<ResourceHandle>()() = resource_handle0; - const string device0 = GetFunctionResourceInputDevice( + const std::string device0 = GetFunctionResourceInputDevice( arg0, /*arg_index=*/0, fdef, &composite_devices); - const string device1 = GetFunctionResourceInputDevice( + const std::string device1 = GetFunctionResourceInputDevice( arg1, /*arg_index=*/1, fdef, &composite_devices); EXPECT_EQ(device0, "/device:COMPOSITE:0"); diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc index 5e5c64d2a2a5ee..1b968b939365a7 100644 --- a/tensorflow/core/framework/function_testlib.cc +++ b/tensorflow/core/framework/function_testlib.cc @@ -48,13 +48,14 @@ GraphDef GDef(absl::Span<const NodeDef> nodes, } // Helper to construct a NodeDef.
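// Illustrative sketch (assumed usage, not part of this patch): with the NDef
// helper whose definition follows, a test can build a NodeDef as
//
//   NodeDef n = test::function::NDef(
//       "x_times_two", "Mul", {"x", "two"}, {{"T", DT_FLOAT}});
//
// where the {"T", DT_FLOAT} attr pair goes through FDH::AttrValueWrapper's
// implicit conversion from DataType, and the device argument defaults to "".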
-NodeDef NDef(absl::string_view name, absl::string_view op, - absl::Span<const string> inputs, - absl::Span<const std::pair<string, FDH::AttrValueWrapper>> attrs, - const string& device) { +NodeDef NDef( + absl::string_view name, absl::string_view op, + absl::Span<const std::string> inputs, + absl::Span<const std::pair<std::string, FDH::AttrValueWrapper>> attrs, + const std::string& device) { NodeDef n; - n.set_name(string(name)); - n.set_op(string(op)); + n.set_name(name); + n.set_op(op); for (const auto& in : inputs) n.add_input(in); n.set_device(device); for (const auto& na : attrs) @@ -609,8 +610,8 @@ FunctionDef XYXLessThanOrEqualToN(int64_t N) { } FunctionDef RandomUniformLess() { - const Tensor kZero = test::AsScalar<int32>(0); - const Tensor kOne = test::AsScalar<int32>(1); + const Tensor kZero = test::AsScalar<int32_t>(0); + const Tensor kOne = test::AsScalar<int32_t>(1); const Tensor k005 = test::AsScalar<float>(0.05); return FDH::Define( @@ -703,8 +704,8 @@ FunctionDef MakeBatchDataset() { } FunctionDef MakeMapDataset(bool has_other_args) { - std::vector<string> args = {"input_dataset: variant"}; - std::vector<string> inputs = {"input_dataset"}; + std::vector<std::string> args = {"input_dataset: variant"}; + std::vector<std::string> inputs = {"input_dataset"}; if (has_other_args) { args.emplace_back("other_arguments: Targuments"); inputs.emplace_back("other_arguments"); diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h index 93cae697e62d15..b4cbf057cbe0a8 100644 --- a/tensorflow/core/framework/function_testlib.h +++ b/tensorflow/core/framework/function_testlib.h @@ -34,15 +34,14 @@ namespace function { class Attrs { public: Attrs(const std::initializer_list< // NOLINT(runtime/explicit) - std::pair<string, FDH::AttrValueWrapper>>& attrs) { + std::pair<std::string, FDH::AttrValueWrapper>>& attrs) { for (const auto& aval : attrs) { map_.insert({aval.first, aval.second.proto}); } } - Attrs( - const std::vector<std::pair<string, FDH::AttrValueWrapper>>& - attrs) { + Attrs(const std::vector< + std::pair<std::string, FDH::AttrValueWrapper>>& attrs) { for (const auto& aval : attrs) { map_.insert({aval.first, aval.second.proto}); } @@ -55,12 +54,12 @@ class Attrs { }; // Helper to construct a NodeDef. -NodeDef NDef( - absl::string_view name, absl::string_view op, - absl::Span<const string> inputs, - absl::Span<const std::pair<string, FDH::AttrValueWrapper>> - attrs = {}, - const string& device = ""); +NodeDef NDef(absl::string_view name, absl::string_view op, + absl::Span<const std::string> inputs, + absl::Span<const std::pair<std::string, FDH::AttrValueWrapper>> + attrs = {}, + const std::string& device = ""); // Helper to construct a GraphDef proto. GraphDef GDef(absl::Span<const NodeDef> nodes, diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc index c603ced808d370..9f54e3eecfdccd 100644 --- a/tensorflow/core/framework/graph_def_util.cc +++ b/tensorflow/core/framework/graph_def_util.cc @@ -35,8 +35,8 @@ limitations under the License.
namespace tensorflow { -string SummarizeGraphDef(const GraphDef& graph_def) { - string ret; +std::string SummarizeGraphDef(const GraphDef& graph_def) { + std::string ret; absl::StrAppend(&ret, "versions = ", graph_def.versions().ShortDebugString(), ";\n"); for (const NodeDef& node : graph_def.node()) { @@ -85,7 +85,7 @@ absl::Status AddDefaultAttrsToGraphDef(GraphDef* graph_def, static absl::Status RemoveNewDefaultAttrsFromNodeDef( NodeDef* node_def, const OpRegistryInterface& consumer_op_registry, const OpRegistryInterface& producer_op_registry, - std::set<std::pair<string, string>>* op_attr_removed) { + std::set<std::pair<std::string, std::string>>* op_attr_removed) { const OpDef* producer_op_def; const OpDef* consumer_op_def; TF_RETURN_IF_ERROR( @@ -93,7 +93,7 @@ static absl::Status RemoveNewDefaultAttrsFromNodeDef( TF_RETURN_IF_ERROR( consumer_op_registry.LookUpOpDef(node_def->op(), &consumer_op_def)); - std::vector<string> to_remove; + std::vector<std::string> to_remove; for (const auto& attr : node_def->attr()) { // If the attr is not in consumer_op_def and doesn't start with '_'... if (!absl::StartsWith(attr.first, "_") && @@ -117,7 +117,7 @@ static absl::Status RemoveNewDefaultAttrsFromNodeDef( // We separate identifying which attrs should be removed from // actually removing them to avoid invalidating the loop iterators // above. - for (const string& attr_name : to_remove) { + for (const std::string& attr_name : to_remove) { node_def->mutable_attr()->erase(attr_name); if (op_attr_removed != nullptr) { op_attr_removed->insert(std::make_pair(node_def->op(), attr_name)); @@ -127,7 +127,7 @@ static absl::Status RemoveNewDefaultAttrsFromNodeDef( return absl::OkStatus(); } -static bool IsFunction(const GraphDef& graph_def, const string& op_name) { +static bool IsFunction(const GraphDef& graph_def, const std::string& op_name) { for (const auto& func_def : graph_def.library().function()) { if (op_name == func_def.signature().name()) return true; } @@ -137,7 +137,7 @@ static bool IsFunction(const GraphDef& graph_def, const string& op_name) { absl::Status RemoveNewDefaultAttrsFromGraphDef( GraphDef* graph_def, const OpRegistryInterface& consumer_op_registry, const OpRegistryInterface& producer_op_registry, - std::set<std::pair<string, string>>* op_attr_removed) { + std::set<std::pair<std::string, std::string>>* op_attr_removed) { // TODO(joshL): Make IsFunction() faster by collecting the names of // all functions as a preprocessing step. for (int n = 0; n < graph_def->node_size(); ++n) { @@ -184,7 +184,7 @@ void StripDefaultAttributes(const OpRegistryInterface& op_registry, for (const OpDef::AttrDef& attr_def : op_def->attr()) { if (attr_def.has_default_value()) { AttrValueMap* attrs = node->mutable_attr(); - const string& name = attr_def.name(); + const std::string& name = attr_def.name(); auto iter = attrs->find(name); if (iter != attrs->end()) { const AttrValue& default_value = attr_def.default_value(); @@ -202,9 +202,9 @@ void StripDefaultAttributes(const OpRegistryInterface& op_registry, } void OpsUsedByGraph(const GraphDef& graph_def, - std::set<string>* ops_used_in_graph) { + std::set<std::string>* ops_used_in_graph) { // Map function names to definitions. - std::unordered_map<string, const FunctionDef*> name_to_function; + std::unordered_map<std::string, const FunctionDef*> name_to_function; for (const auto& function : graph_def.library().function()) { name_to_function.insert( std::make_pair(function.signature().name(), &function)); @@ -212,11 +212,11 @@ void OpsUsedByGraph(const GraphDef& graph_def, // Collect the sorted list of op names. Since functions can reference // functions, we need a recursive traversal.
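// Illustrative note (consistent with graph_def_util_test.cc later in this
// patch): for a graph whose single node calls a function "F" whose body uses
// ops "B" and "C", OpsUsedByGraph() reports {"B", "C"}; the function name
// "F" itself is filtered out of the result before it is returned.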
- std::set<string> used_ops; // Includes both primitive ops and functions + std::set<std::string> used_ops; // Includes both primitive ops and functions std::vector<const FunctionDef*> functions_to_process; // A subset of used_ops // Collect the logic to mark an op in a lambda; it'll be used twice below. const auto mark_op_as_used = [&used_ops, &functions_to_process, - &name_to_function](const string& op) { + &name_to_function](const std::string& op) { if (used_ops.insert(op).second) { // If it's a function, we'll need to process further const auto it = name_to_function.find(op); @@ -239,7 +239,7 @@ void OpsUsedByGraph(const GraphDef& graph_def, // Filter out function names to produce output. // TODO(josh11b): Change the above code to produce this directly. ops_used_in_graph->clear(); - for (const string& op_name : used_ops) { + for (const std::string& op_name : used_ops) { if (name_to_function.find(op_name) == name_to_function.end()) { ops_used_in_graph->insert(op_name); } @@ -249,12 +249,12 @@ absl::Status StrippedOpListForGraph(const GraphDef& graph_def, const OpRegistryInterface& op_registry, OpList* stripped_op_list) { - std::set<string> used_ops; + std::set<std::string> used_ops; OpsUsedByGraph(graph_def, &used_ops); // Build the stripped op list in sorted order, ignoring functions. stripped_op_list->clear_op(); - for (const string& op_name : used_ops) { + for (const std::string& op_name : used_ops) { const OpDef* op_def; TF_RETURN_IF_ERROR(op_registry.LookUpOpDef(op_name, &op_def)); OpDef* stripped_op = stripped_op_list->add_op(); diff --git a/tensorflow/core/framework/graph_def_util.h b/tensorflow/core/framework/graph_def_util.h index a164ac310fe4ed..b3e335e776f3f6 100644 --- a/tensorflow/core/framework/graph_def_util.h +++ b/tensorflow/core/framework/graph_def_util.h @@ -29,7 +29,7 @@ class NodeDef; // Produce a human-readable version of a GraphDef that is more concise // than a text-format proto. -string SummarizeGraphDef(const GraphDef& graph_def); +std::string SummarizeGraphDef(const GraphDef& graph_def); // Validates the syntax of a GraphDef provided externally. // @@ -97,7 +97,7 @@ absl::Status AddDefaultAttrsToGraphDef(GraphDef* graph_def, absl::Status RemoveNewDefaultAttrsFromGraphDef( GraphDef* graph_def, const OpRegistryInterface& consumer_op_registry, const OpRegistryInterface& producer_op_registry, - std::set<std::pair<string, string>>* op_attr_removed); + std::set<std::pair<std::string, std::string>>* op_attr_removed); // Goes over the `nodes` and removes attributes that are set to their // default values according to op_registry. @@ -115,7 +115,7 @@ void StripDefaultAttributes(const OpRegistryInterface& op_registry, // // This returns the ops used as a set of strings. void OpsUsedByGraph(const GraphDef& graph_def, - std::set<string>* ops_used_in_graph); + std::set<std::string>* ops_used_in_graph); // This function computes the stripped_op_list field of MetaGraphDef // and similar protos.
The op_registry should contain the ops used to diff --git a/tensorflow/core/framework/graph_def_util_test.cc b/tensorflow/core/framework/graph_def_util_test.cc index 12a1ee29fe792e..503f2cc93af194 100644 --- a/tensorflow/core/framework/graph_def_util_test.cc +++ b/tensorflow/core/framework/graph_def_util_test.cc @@ -59,7 +59,7 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, NoChangeWithDefault) { .Finalize(graph_def.add_node())); GraphDef expected_graph_def = graph_def; - std::set<std::pair<string, string>> op_attr_removed; + std::set<std::pair<std::string, std::string>> op_attr_removed; TF_ASSERT_OK(RemoveNewDefaultAttrsFromGraphDef(&graph_def, registry, registry, &op_attr_removed)); @@ -80,7 +80,7 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, NoChangeNoDefault) { .Finalize(graph_def.add_node())); GraphDef expected_graph_def = graph_def; - std::set<std::pair<string, string>> op_attr_removed; + std::set<std::pair<std::string, std::string>> op_attr_removed; TF_ASSERT_OK(RemoveNewDefaultAttrsFromGraphDef(&graph_def, registry, registry, &op_attr_removed)); @@ -106,7 +106,7 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, UsesDefault) { TF_ASSERT_OK(NodeDefBuilder("uses_default", "UsesDefault", &producer_registry) .Finalize(produced_graph_def.add_node())); - std::set<std::pair<string, string>> op_attr_removed; + std::set<std::pair<std::string, std::string>> op_attr_removed; TF_ASSERT_OK( RemoveNewDefaultAttrsFromGraphDef(&produced_graph_def, consumer_registry, producer_registry, &op_attr_removed)); @@ -116,7 +116,8 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, UsesDefault) { .Finalize(expected_graph_def.add_node())); TF_EXPECT_GRAPH_EQ(expected_graph_def, produced_graph_def); - std::set<std::pair<string, string>> expected_removed({{"UsesDefault", "a"}}); + std::set<std::pair<std::string, std::string>> expected_removed( + {{"UsesDefault", "a"}}); EXPECT_EQ(expected_removed, op_attr_removed); } @@ -142,7 +143,7 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, ChangedFromDefault) { .Finalize(produced_graph_def.add_node())); GraphDef expected_graph_def = produced_graph_def; - std::set<std::pair<string, string>> op_attr_removed; + std::set<std::pair<std::string, std::string>> op_attr_removed; TF_ASSERT_OK( RemoveNewDefaultAttrsFromGraphDef(&produced_graph_def, consumer_registry, producer_registry, &op_attr_removed)); @@ -174,7 +175,7 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, UnderscoreAttrs) { .Finalize(produced_graph_def.add_node())); GraphDef expected_graph_def = produced_graph_def; - std::set<std::pair<string, string>> op_attr_removed; + std::set<std::pair<std::string, std::string>> op_attr_removed; TF_ASSERT_OK( RemoveNewDefaultAttrsFromGraphDef(&produced_graph_def, consumer_registry, producer_registry, &op_attr_removed)); @@ -213,7 +214,7 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, HasFunction) { TF_ASSERT_OK(NodeDefBuilder("call_func", "my_func", &function_registry) .Finalize(produced_graph_def.add_node())); - std::set<std::pair<string, string>> op_attr_removed; + std::set<std::pair<std::string, std::string>> op_attr_removed; TF_ASSERT_OK( RemoveNewDefaultAttrsFromGraphDef(&produced_graph_def, consumer_registry, producer_registry, &op_attr_removed)); @@ -231,7 +232,8 @@ TEST(RemoveNewDefaultAttrsFromGraphDefTest, HasFunction) { EXPECT_EQ(expected_graph_def.library().DebugString(), produced_graph_def.library().DebugString()); - std::set<std::pair<string, string>> expected_removed({{"UsesDefault", "a"}}); + std::set<std::pair<std::string, std::string>> expected_removed( + {{"UsesDefault", "a"}}); EXPECT_EQ(expected_removed, op_attr_removed); } @@ -272,7 +274,7 @@ TEST(StripDefaultAttributesTest, NonDefaultNotStripped) { TEST(StrippedOpListForGraphTest, FlatTest) { // Make four ops OpList op_list; - for (const string& op : {"A", "B", "C", "D"}) { + for (const std::string& op : {"A", "B", "C", "D"}) { OpDef* op_def = op_list.add_op(); op_def->set_name(op); op_def->set_summary("summary"); @@ -282,7 +284,7 @@ TEST(StrippedOpListForGraphTest, FlatTest) { // Make a graph which uses
two ops once and twice, respectively. // The result should be independent of the ordering. - const string graph_ops[4][3] = { + const std::string graph_ops[4][3] = { {"C", "B", "B"}, {"B", "C", "B"}, {"B", "B", "C"}, {"C", "C", "B"}}; for (const bool use_function : {false, true}) { for (int order = 0; order < 4; order++) { @@ -290,13 +292,13 @@ TEST(StrippedOpListForGraphTest, FlatTest) { if (use_function) { FunctionDef* function_def = graph_def.mutable_library()->add_function(); function_def->mutable_signature()->set_name("F"); - for (const string& op : graph_ops[order]) { + for (const std::string& op : graph_ops[order]) { function_def->add_node_def()->set_op(op); } graph_def.add_node()->set_op("F"); } else { - for (const string& op : graph_ops[order]) { - string name = absl::StrCat("name", graph_def.node_size()); + for (const std::string& op : graph_ops[order]) { + std::string name = absl::StrCat("name", graph_def.node_size()); NodeDef* node = graph_def.add_node(); node->set_name(name); node->set_op(op); @@ -319,9 +321,9 @@ TEST(StrippedOpListForGraphTest, FlatTest) { } // Should get the same result using OpsUsedByGraph(). - std::set<string> used_ops; + std::set<std::string> used_ops; OpsUsedByGraph(graph_def, &used_ops); - ASSERT_EQ(std::set<string>({"B", "C"}), used_ops); + ASSERT_EQ(std::set<std::string>({"B", "C"}), used_ops); } } } @@ -356,9 +358,9 @@ TEST(StrippedOpListForGraphTest, NestedFunctionTest) { ASSERT_EQ(stripped_op_list.op(0).name(), "A"); // Should get the same result using OpsUsedByGraph(). - std::set<string> used_ops; + std::set<std::string> used_ops; OpsUsedByGraph(graph_def, &used_ops); - ASSERT_EQ(std::set<string>({"A"}), used_ops); + ASSERT_EQ(std::set<std::string>({"A"}), used_ops); } } diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc index 95b6287c4e56b6..b3226c6fac490b 100644 --- a/tensorflow/core/framework/graph_to_functiondef.cc +++ b/tensorflow/core/framework/graph_to_functiondef.cc @@ -51,45 +51,45 @@ class NodeNameMapping { // Normalize the input name and make it unique. This is the same as the // function for output, except that it adds a name mapping for the name. - string GetInputName(const string& name); + std::string GetInputName(const std::string& name); // Normalize the output name and make it unique. - string GetOutputName(const string& name); + std::string GetOutputName(const std::string& name); // Make the node name unique. - string Uniquify(const string& name); + std::string Uniquify(const std::string& name); // Records name as a used name. If this name is already used, // returns an error status. - absl::Status UseOutputName(const string& name); + absl::Status UseOutputName(const std::string& name); // Look up how a node name was previously normalized/uniquified. // Returns empty if name was never seen. - string Lookup(const string& name) const; + std::string Lookup(const std::string& name) const; private: - string UniquifyHelper(const string& name); - static string Normalize(string name); + std::string UniquifyHelper(const std::string& name); + static std::string Normalize(std::string name); // The normalized/uniquified names already used as // input names (in signature), output names (in signature), and node names // (in node_def). // This is a superset of values in name_mapping_. - absl::flat_hash_map<string, uint64> used_names_; + absl::flat_hash_map<std::string, uint64_t> used_names_; // Mapping from original node name from the graph to the normalized // and uniquified version of it.
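// Illustrative note (assumption based on UniquifyHelper() below): the first
// node named "add" keeps the name "add"; a second node with the same name is
// renamed to "add_0", the next to "add_1", and so on.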
- absl::flat_hash_map<string, string> name_mapping_; + absl::flat_hash_map<std::string, std::string> name_mapping_; }; -string NodeNameMapping::Normalize(string name) { +std::string NodeNameMapping::Normalize(std::string name) { // Convert letters to lowercase and non-alphanumeric characters to '_'. if (name.empty()) return "unknown"; const int n = name.size(); for (int i = 0; i < n; ++i) { char c = name[i]; - if (isalnum(c)) { - if (isupper(c)) { - name[i] = tolower(c); + if (absl::ascii_isalnum(c)) { + if (absl::ascii_isupper(c)) { + name[i] = absl::ascii_tolower(c); } } else { name[i] = '_'; @@ -99,45 +99,45 @@ string NodeNameMapping::Normalize(string name) { // Find the first letter and start with it. int i = 0; for (; i < n; ++i) { - if (isalpha(name[i])) break; + if (absl::ascii_isalpha(name[i])) break; } // Return "unknown" if none of the name's chars were letters. return i == n ? "unknown" : name.substr(i); } -string NodeNameMapping::UniquifyHelper(const string& name) { +std::string NodeNameMapping::UniquifyHelper(const std::string& name) { auto it = used_names_.emplace(name, 0); // If the name hasn't been used yet, use it as-is. if (it.second) return name; // Add a suffix to name to make it unique. while (true) { - const string candidate = absl::StrCat(name, "_", it.first->second); + const std::string candidate = absl::StrCat(name, "_", it.first->second); it.first->second++; if (used_names_.emplace(candidate, 0).second) return candidate; } } -string NodeNameMapping::GetInputName(const string& name) { - const string& input_name = UniquifyHelper(Normalize(name)); +std::string NodeNameMapping::GetInputName(const std::string& name) { + const std::string& input_name = UniquifyHelper(Normalize(name)); name_mapping_[name] = input_name; return input_name; } -string NodeNameMapping::GetOutputName(const string& name) { - const string& input_name = UniquifyHelper(Normalize(name)); +std::string NodeNameMapping::GetOutputName(const std::string& name) { + const std::string& input_name = UniquifyHelper(Normalize(name)); // Don't add it to name_mapping_ since this name is not for a node.
return input_name; } -string NodeNameMapping::Uniquify(const string& name) { - const string uniqued = UniquifyHelper(name); +std::string NodeNameMapping::Uniquify(const std::string& name) { + const std::string uniqued = UniquifyHelper(name); name_mapping_[name] = uniqued; return uniqued; } -absl::Status NodeNameMapping::UseOutputName(const string& name) { +absl::Status NodeNameMapping::UseOutputName(const std::string& name) { const auto& iter = used_names_.find(name); if (iter != used_names_.end()) { return errors::InvalidArgument( @@ -148,19 +148,19 @@ absl::Status NodeNameMapping::UseOutputName(const string& name) { return absl::OkStatus(); } -string NodeNameMapping::Lookup(const string& name) const { +std::string NodeNameMapping::Lookup(const std::string& name) const { const auto iter = name_mapping_.find(name); - if (iter == name_mapping_.end()) return string(); + if (iter == name_mapping_.end()) return std::string(); return iter->second; } absl::Status FillFunctionBody( - const string& fn_name, const NodeNameMapping& node_names, + const std::string& fn_name, const NodeNameMapping& node_names, const std::vector<const Node*>& body_nodes, - const absl::flat_hash_map<string, string>& tensor_renaming, + const absl::flat_hash_map<std::string, std::string>& tensor_renaming, bool set_stateful_from_nodes, bool copy_placeholder_attrs_from_nodes, bool allow_destructive_reads, FunctionDef* fdef) { - absl::flat_hash_set<string> func_attr_names; + absl::flat_hash_set<std::string> func_attr_names; for (const auto& func_attr : fdef->signature().attr()) { func_attr_names.insert(func_attr.name()); } @@ -263,7 +263,7 @@ absl::Status FillFunctionBody( for (const Edge* edge : control_edges) { // Add this control input only if the src node is in the body or a part of // the inputs. - const string normalized = node_names.Lookup(edge->src()->name()); + const std::string normalized = node_names.Lookup(edge->src()->name()); // If we did not find a name for the source of control edge, this // source must be outside of the body, and not an input. Raise an error. if (normalized.empty()) { @@ -322,15 +322,16 @@ absl::Status FillFunctionBody( } absl::Status GraphToFunctionDefHelper( - const Graph& fn_body, const string& fn_name, bool append_hash_to_fn_name, - bool set_stateful_from_nodes, bool copy_placeholder_attrs_from_nodes, + const Graph& fn_body, const std::string& fn_name, + bool append_hash_to_fn_name, bool set_stateful_from_nodes, + bool copy_placeholder_attrs_from_nodes, const std::vector<const Node*>& body_nodes, const std::vector<OutputTensor>& inputs, const std::vector<OutputTensor>& outputs, - const std::vector<string>& output_names, + const std::vector<std::string>& output_names, const std::vector<const Node*>& control_outputs, - const std::vector<string>& control_output_names, const char* description, - bool allow_destructive_reads, FunctionDef* fdef) { + const std::vector<std::string>& control_output_names, + const char* description, bool allow_destructive_reads, FunctionDef* fdef) { if (!output_names.empty()) { DCHECK_EQ(output_names.size(), outputs.size()); } @@ -350,7 +351,7 @@ absl::Status GraphToFunctionDefHelper( // - For tensors produced by nodes in function's body: // {flat_tensor_name -> nested_tensor_name} // e.g. {Add:3 -> add_0:z:1} - absl::flat_hash_map<string, string> tensor_renaming; + absl::flat_hash_map<std::string, std::string> tensor_renaming; // Fill outputs in function's signature.
// We fill the outputs first to prevent output_names from colliding @@ -380,7 +381,7 @@ absl::Status GraphToFunctionDefHelper( int idx = inputs[i].index; OpDef::ArgDef* argdef = fdef->mutable_signature()->add_input_arg(); argdef->set_type(node->output_type(idx)); - const string& input_name = node_names.GetInputName(node->name()); + const std::string& input_name = node_names.GetInputName(node->name()); argdef->set_name(input_name); FunctionDef::ArgAttrs arg_attrs; int64_t resource_arg_unique_id = -1; @@ -431,7 +432,7 @@ absl::Status GraphToFunctionDefHelper( // in tensor_renaming. for (const Node* node : body_nodes) { // Make sure node_name does not collide with an input or output name. - const string& node_name = node_names.Uniquify(node->name()); + const std::string& node_name = node_names.Uniquify(node->name()); // For each output_arg in the op_def, the output_ranges // map will have [start, end] range of indices that this arg produces // among all the output tensors of this op. @@ -443,8 +444,8 @@ absl::Status GraphToFunctionDefHelper( int index_start = output.second.first; int index_end = output.second.second; for (int i = index_start; i < index_end; ++i) { - const string& original_name = absl::StrCat(node->name(), ":", i); - const string& new_name = + const std::string& original_name = absl::StrCat(node->name(), ":", i); + const std::string& new_name = strings::StrCat(node_name, ":", output_name, ":", i - index_start); // Record the mapping if this tensor is not already mapped. // Tensor can be already mapped if it is used as an input. @@ -461,10 +462,10 @@ absl::Status GraphToFunctionDefHelper( // Remap return values. for (int r = 0; r < fdef->signature().output_arg_size(); ++r) { - const string& ret_name = fdef->signature().output_arg(r).name(); + const std::string& ret_name = fdef->signature().output_arg(r).name(); // We convert this flat tensor name to the nested value // (e.g. `add:z:1`) that we stored in tensor_renaming. 
- string return_value; + std::string return_value; if (outputs[r].node->IsRetval()) { Edge const* edge; TF_RETURN_IF_ERROR(outputs[r].node->input_edge(0, &edge)); @@ -484,8 +485,8 @@ } if (append_hash_to_fn_name) { - const uint64 hash = FunctionDefHash(*fdef); - string encoded; + const uint64_t hash = FunctionDefHash(*fdef); + std::string encoded; TF_RETURN_IF_ERROR(Base64Encode( absl::string_view(reinterpret_cast<const char*>(&hash), sizeof(hash)), &encoded)); @@ -508,9 +509,9 @@ ") and the number of control output names (", control_output_names.size(), ") to match but they do not."); } - std::set<string> control_output_names_set; + std::set<std::string> control_output_names_set; for (int i = 0; i < control_outputs.size(); ++i) { - string signature_name; + std::string signature_name; if (!control_output_names.empty()) { signature_name = control_output_names[i]; } else { @@ -523,7 +524,7 @@ return errors::InvalidArgument("Repeated control output name: ", signature_name); } - const string control_output_node = + const std::string control_output_node = node_names.Lookup(control_outputs[i]->name()); if (control_output_node.empty()) { return errors::InvalidArgument( @@ -531,7 +532,7 @@ } (*fdef->mutable_control_ret())[signature_name] = control_output_node; } - for (const string& control_output : control_output_names_set) { + for (const std::string& control_output : control_output_names_set) { fdef->mutable_signature()->add_control_output(control_output); } @@ -539,9 +540,9 @@ } absl::Status GraphToFunctionDefHelper( - const Graph& graph, const string& name, - const std::function<absl::optional<string>(const Node*)>& control_ret, - const std::vector<string>& output_names, bool allow_destructive_reads, + const Graph& graph, const std::string& name, + const std::function<absl::optional<std::string>(const Node*)>& control_ret, + const std::vector<std::string>& output_names, bool allow_destructive_reads, FunctionDef* fdef) { auto add_arg_or_retval = [](Node* node, std::vector<OutputTensor>* args_or_retvals) { @@ -566,7 +567,7 @@ absl::Status GraphToFunctionDefHelper( std::vector<OutputTensor> inputs; std::vector<OutputTensor> outputs; std::vector<const Node*> control_outputs; - std::vector<string> control_output_names; + std::vector<std::string> control_output_names; for (Node* node : graph.op_nodes()) { if (node->IsArg()) { TF_RETURN_IF_ERROR(add_arg_or_retval(node, &inputs)); @@ -591,7 +592,7 @@ auto validate_args_retvals = [](const std::vector<OutputTensor>& args_or_retvals, - const string& op_type) { + const std::string& op_type) { for (int i = 0, e = args_or_retvals.size(); i < e; ++i) { if (args_or_retvals[i].node == nullptr) { return errors::InvalidArgument("Missing '", op_type, @@ -614,17 +615,17 @@ } // anonymous namespace -absl::Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, - bool append_hash_to_fn_name, - bool set_stateful_from_nodes, - bool copy_placeholder_attrs_from_nodes, - const std::vector<const Node*>& body_nodes, - const std::vector<OutputTensor>& inputs, - const std::vector<OutputTensor>& outputs, - const std::vector<string>& output_names, - const std::vector<const Node*>& control_outputs, - const std::vector<string>& control_output_names, - const char* description, FunctionDef* fdef) { +absl::Status GraphToFunctionDef( + const Graph& fn_body, const std::string& fn_name, + bool append_hash_to_fn_name, bool set_stateful_from_nodes, + bool copy_placeholder_attrs_from_nodes, + const std::vector<const Node*>& body_nodes, + const std::vector<OutputTensor>& inputs, + const
std::vector<OutputTensor>& outputs, + const std::vector<std::string>& output_names, + const std::vector<const Node*>& control_outputs, + const std::vector<std::string>& control_output_names, + const char* description, FunctionDef* fdef) { return GraphToFunctionDefHelper( fn_body, fn_name, append_hash_to_fn_name, set_stateful_from_nodes, copy_placeholder_attrs_from_nodes, body_nodes, inputs, outputs, @@ -634,20 +635,20 @@ absl::Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, } absl::Status GraphToFunctionDef( - const Graph& graph, const string& name, - const std::function<absl::optional<string>(const Node*)>& control_ret, + const Graph& graph, const std::string& name, + const std::function<absl::optional<std::string>(const Node*)>& control_ret, FunctionDef* fdef) { return GraphToFunctionDefHelper(graph, name, control_ret, /*output_names=*/{}, /*allow_destructive_reads=*/false, fdef); } -absl::Status GraphToFunctionDef(const Graph& graph, const string& name, +absl::Status GraphToFunctionDef(const Graph& graph, const std::string& name, FunctionDef* fdef) { return GraphToFunctionDef(graph, name, /*control_ret=*/nullptr, fdef); } -absl::Status GraphToFunctionDef(const Graph& graph, const string& name, +absl::Status GraphToFunctionDef(const Graph& graph, const std::string& name, const std::vector<string>& output_names, FunctionDef* fdef) { return GraphToFunctionDefHelper(graph, name, /*control_ret=*/nullptr, @@ -656,8 +657,8 @@ absl::Status GraphToFunctionDef(const Graph& graph, const string& name, } absl::Status GraphToFunctionDef( - std::unique_ptr<Graph> graph, const string& name, - const std::function<absl::optional<string>(const Node*)>& control_ret, + std::unique_ptr<Graph> graph, const std::string& name, + const std::function<absl::optional<std::string>(const Node*)>& control_ret, FunctionDef* fdef) { return GraphToFunctionDefHelper(*graph, name, control_ret, /*output_names=*/{}, diff --git a/tensorflow/core/framework/graph_to_functiondef.h b/tensorflow/core/framework/graph_to_functiondef.h index 369b86ecea5e03..4558af7938f312 100644 --- a/tensorflow/core/framework/graph_to_functiondef.h +++ b/tensorflow/core/framework/graph_to_functiondef.h @@ -29,17 +29,17 @@ namespace tensorflow { // Graph to FunctionDef conversion. This code is closely modeled on the Python // function graph_to_function_def(), which is located in // tensorflow/python/framework/graph_to_function_def.py. -absl::Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, - bool append_hash_to_fn_name, - bool set_stateful_from_nodes, - bool copy_placeholder_attrs_from_nodes, - const std::vector<const Node*>& body_nodes, - const std::vector<OutputTensor>& inputs, - const std::vector<OutputTensor>& outputs, - const std::vector<string>& output_names, - const std::vector<const Node*>& control_outputs, - const std::vector<string>& control_output_names, - const char* description, FunctionDef* fdef); +absl::Status GraphToFunctionDef( + const Graph& fn_body, const std::string& fn_name, + bool append_hash_to_fn_name, bool set_stateful_from_nodes, + bool copy_placeholder_attrs_from_nodes, + const std::vector<const Node*>& body_nodes, + const std::vector<OutputTensor>& inputs, + const std::vector<OutputTensor>& outputs, + const std::vector<std::string>& output_names, + const std::vector<const Node*>& control_outputs, + const std::vector<std::string>& control_output_names, + const char* description, FunctionDef* fdef); // Converts 'graph' to a FunctionDef 'fdef', with name 'name': // @@ -50,20 +50,20 @@ absl::Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, // `control_output` in Op definition (see OpDef). Control output name must // be unique for all control output nodes.
absl::Status GraphToFunctionDef( - const Graph& graph, const string& name, - const std::function<absl::optional<string>(const Node*)>& control_ret, + const Graph& graph, const std::string& name, + const std::function<absl::optional<std::string>(const Node*)>& control_ret, FunctionDef* fdef); -absl::Status GraphToFunctionDef(const Graph& graph, const string& name, +absl::Status GraphToFunctionDef(const Graph& graph, const std::string& name, FunctionDef* fdef); -absl::Status GraphToFunctionDef(const Graph& graph, const string& name, +absl::Status GraphToFunctionDef(const Graph& graph, const std::string& name, const std::vector<string>& output_names, FunctionDef* fdef); absl::Status GraphToFunctionDef( - std::unique_ptr<Graph> graph, const string& name, - const std::function<absl::optional<string>(const Node*)>& control_ret, + std::unique_ptr<Graph> graph, const std::string& name, + const std::function<absl::optional<std::string>(const Node*)>& control_ret, FunctionDef* fdef); } // namespace tensorflow diff --git a/tensorflow/core/framework/graph_to_functiondef_test.cc b/tensorflow/core/framework/graph_to_functiondef_test.cc index d71f6b9ff47a3b..719f9af233758e 100644 --- a/tensorflow/core/framework/graph_to_functiondef_test.cc +++ b/tensorflow/core/framework/graph_to_functiondef_test.cc @@ -47,7 +47,7 @@ FunctionDef RemoveDebugInfo(const FunctionDef& def) { } bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, - string* diff) { + std::string* diff) { // TODO(phawkins) use a more sophisticated equality test. if (a.DebugString() != b.DebugString()) { if (diff) { @@ -95,7 +95,7 @@ TEST(GraphToFunctionDefTest, Basics) { }, {{"h", "G:sum:0"}}); // return values - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); @@ -119,7 +119,7 @@ TEST(GraphToFunctionDefTest, OverrideOutputNames) { {}, // body {{"b", "a"}}); // return values - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); @@ -168,7 +168,7 @@ TEST(GraphToFunctionDefTest, ArgAttrShape) { attrs.mutable_attr()->insert({"_output_shapes", output_shapes}); (*fdef_expected.mutable_arg_attr())[0] = std::move(attrs); - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); @@ -199,7 +199,7 @@ TEST(GraphToFunctionDefTest, ArgAttrPrivateAttr) { attrs.mutable_attr()->insert({"_name", private_attr}); (*fdef_expected.mutable_arg_attr())[0] = std::move(attrs); - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); @@ -266,7 +266,7 @@ TEST(GraphToFunctionDefTest, ArgAttrConstInput) { (*fdef_expected.mutable_signature()->mutable_description()) = "ArgAttrConstInput"; - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); @@ -374,7 +374,7 @@ TEST(GraphToFunctionDefTest, ControlDependencies) { }, {{"c", "b:y:0"}}); // return values - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); @@ -395,8 +395,9 @@ TEST(GraphToFunctionDefTest, ControlOutputs) { TF_EXPECT_OK(ConvertGraphDefToGraph(options, graph_def, graph.get())); // Add a 'b' node to the control return set.
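// Illustrative note (inferred from this test's expectations below): mapping
// node "b" to "must_execute" makes GraphToFunctionDef produce a FunctionDef
// carrying, in textproto form,
//
//   signature { control_output: "must_execute" }
//   control_ret { key: "must_execute" value: "b" }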
- const auto control_ret = [](const Node* n) -> absl::optional<string> { - if (n->name() == "b") return absl::make_optional<string>("must_execute"); + const auto control_ret = [](const Node* n) -> absl::optional<std::string> { + if (n->name() == "b") + return absl::make_optional<std::string>("must_execute"); return absl::nullopt; }; @@ -415,7 +416,7 @@ TEST(GraphToFunctionDefTest, ControlOutputs) { {{"c", "b:y:0"}}, // return values {{"must_execute", "b"}}); // control returns - string diff; + std::string diff; bool fdefs_equal = EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff); diff --git a/tensorflow/core/framework/kernel_def_builder.cc b/tensorflow/core/framework/kernel_def_builder.cc index c9788b0a08c45f..7b7e90df8bab2a 100644 --- a/tensorflow/core/framework/kernel_def_builder.cc +++ b/tensorflow/core/framework/kernel_def_builder.cc @@ -55,8 +55,8 @@ KernelDefBuilder& KernelDefBuilder::AttrConstraint( } template <> -KernelDefBuilder& KernelDefBuilder::AttrConstraint<string>( - const char* attr_name, absl::Span<const string> allowed) { +KernelDefBuilder& KernelDefBuilder::AttrConstraint<std::string>( + const char* attr_name, absl::Span<const std::string> allowed) { auto* constraint = kernel_def_->add_constraint(); constraint->set_name(attr_name); auto* allowed_values = constraint->mutable_allowed_values()->mutable_list(); @@ -67,11 +67,11 @@ KernelDefBuilder& KernelDefBuilder::AttrConstraint( } template <> -KernelDefBuilder& KernelDefBuilder::AttrConstraint<string>( - const char* attr_name, string allowed) { - return AttrConstraint<string>( - attr_name, - absl::Span<const string>(std::initializer_list<string>({allowed}))); +KernelDefBuilder& KernelDefBuilder::AttrConstraint<std::string>( + const char* attr_name, std::string allowed) { + return AttrConstraint<std::string>(attr_name, + absl::Span<const std::string>( + std::initializer_list<std::string>({allowed}))); } template <> diff --git a/tensorflow/core/framework/kernel_def_builder_test.cc b/tensorflow/core/framework/kernel_def_builder_test.cc index fa37b114abbe22..eefa454beb763e 100644 --- a/tensorflow/core/framework/kernel_def_builder_test.cc +++ b/tensorflow/core/framework/kernel_def_builder_test.cc @@ -48,7 +48,7 @@ TEST(KernelDefBuilderTest, TypeConstraint) { def = KernelDefBuilder("C") .Device(DEVICE_GPU) - .TypeConstraint("U") + .TypeConstraint("V") .Build(); @@ -95,7 +95,7 @@ TEST(KernelDefBuilderTest, Int64Constraint) { .Device(DEVICE_GPU) .AttrConstraint("U", absl::Span<const int64_t>{int64_t{5}, int64_t{17}}) - .AttrConstraint("V", string("proto")) + .AttrConstraint("V", std::string("proto")) .Build(); protobuf::TextFormat::ParseFromString( @@ -136,7 +136,7 @@ TEST(KernelDefBuilderTest, StringConstraint) { def = KernelDefBuilder("C") .Device(DEVICE_GPU) .AttrConstraint("U", absl::Span{"boo", "ya"}) - .AttrConstraint("V", string("proto")) + .AttrConstraint("V", std::string("proto")) .Build(); protobuf::TextFormat::ParseFromString( diff --git a/tensorflow/core/framework/kernel_def_util_test.cc b/tensorflow/core/framework/kernel_def_util_test.cc index a2e4aa82fafd56..a15fa7b0cfbe0f 100644 --- a/tensorflow/core/framework/kernel_def_util_test.cc +++ b/tensorflow/core/framework/kernel_def_util_test.cc @@ -24,13 +24,13 @@ namespace tensorflow { namespace { -NodeDef NodeDefFromText(const string& text) { +NodeDef NodeDefFromText(const std::string& text) { NodeDef node_def; EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &node_def)); return node_def; } -KernelDef KernelDefFromText(const string& text) { +KernelDef KernelDefFromText(const std::string& text) { KernelDef kernel_def; EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &kernel_def)); return kernel_def; @@ -38,8 +38,8 @@ KernelDef
KernelDefFromText(const string& text) { class AttrsMatchTest : public ::testing::Test { protected: - void ExpectStatus(const string& node_def_str, const string& kernel_def_str, - error::Code code) { + void ExpectStatus(const std::string& node_def_str, + const std::string& kernel_def_str, error::Code code) { bool match; auto status = KernelAttrsMatch(KernelDefFromText(kernel_def_str), NodeDefFromText(node_def_str), &match); @@ -53,7 +53,7 @@ class AttrsMatchTest : public ::testing::Test { }; TEST_F(AttrsMatchTest, ValidConstraint) { - string node_def_str = R"( + std::string node_def_str = R"( name: "ValidConstraint-op" op: "ValidConstraint" attr { @@ -63,7 +63,7 @@ TEST_F(AttrsMatchTest, ValidConstraint) { } } )"; - string kernel_def_str = R"( + std::string kernel_def_str = R"( op: "ValidConstraint" device_type: "CPU" constraint { @@ -79,7 +79,7 @@ TEST_F(AttrsMatchTest, ValidConstraint) { } TEST_F(AttrsMatchTest, BadConstraint) { - string node_def_str = R"( + std::string node_def_str = R"( name: "BadConstraint-op" op: "BadConstraint" attr { @@ -89,7 +89,7 @@ TEST_F(AttrsMatchTest, BadConstraint) { } } )"; - string kernel_def_str = R"( + std::string kernel_def_str = R"( op: "BadConstraint" device_type: "CPU" constraint { @@ -105,7 +105,7 @@ TEST_F(AttrsMatchTest, BadConstraint) { } TEST_F(AttrsMatchTest, Unimplemented) { - string node_def_str = R"( + std::string node_def_str = R"( name: "BadConstraint-op" op: "BadConstraint" attr { @@ -115,7 +115,7 @@ TEST_F(AttrsMatchTest, Unimplemented) { } } )"; - string kernel_def_str = R"( + std::string kernel_def_str = R"( op: "BadConstraint" device_type: "CPU" constraint { diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc index a8ad5ba42069a7..df63471f59dff3 100644 --- a/tensorflow/core/framework/load_library.cc +++ b/tensorflow/core/framework/load_library.cc @@ -46,10 +46,10 @@ struct Library { absl::Status LoadDynamicLibrary(const char* library_filename, void** result, const void** buf, size_t* len) { static mutex mu(LINKER_INITIALIZED); - static std::unordered_map<string, Library> loaded_libs; + static std::unordered_map<std::string, Library> loaded_libs; Env* env = Env::Default(); Library library; - std::unordered_set<string> seen_op_names; + std::unordered_set<std::string> seen_op_names; { mutex_lock lock(mu); if (loaded_libs.find(library_filename) != loaded_libs.end()) { @@ -90,7 +90,7 @@ absl::Status LoadDynamicLibrary(const char* library_filename, void** result, loaded_libs[library_filename] = library; } } - string str; + std::string str; library.op_list.SerializeToString(&str); char* str_buf = reinterpret_cast<char*>(port::Malloc(str.length())); memcpy(str_buf, str.data(), str.length()); diff --git a/tensorflow/core/framework/local_rendezvous.cc b/tensorflow/core/framework/local_rendezvous.cc index fffc5f8864e992..6a56c1695d35b9 100644 --- a/tensorflow/core/framework/local_rendezvous.cc +++ b/tensorflow/core/framework/local_rendezvous.cc @@ -141,7 +141,7 @@ LocalRendezvous::~LocalRendezvous() { } namespace { -uint64 KeyHash(const absl::string_view& k) { +uint64_t KeyHash(const absl::string_view& k) { return Hash64(k.data(), k.size()); } } // namespace @@ -149,7 +149,7 @@ uint64 KeyHash(const absl::string_view& k) { absl::Status LocalRendezvous::Send(const Rendezvous::ParsedKey& key, const Rendezvous::Args& send_args, const Tensor& val, const bool is_dead) { - uint64 key_hash = KeyHash(key.FullKey()); + uint64_t key_hash = KeyHash(key.FullKey()); DVLOG(2) << "Send " << this << " " << key_hash << " " << key.FullKey(); if (is_dead) { @@ -158,7 +158,7 @@
absl::Status LocalRendezvous::Send(const Rendezvous::ParsedKey& key, "The number of dead values sent between a pair of devices.", "send_device", "recv_device"); rendezvous_dead_values_sent - ->GetCell(string(key.src_device), string(key.dst_device)) + ->GetCell(std::string(key.src_device), std::string(key.dst_device)) ->IncrementBy(1); } @@ -229,7 +229,7 @@ absl::Status LocalRendezvous::Send(const Rendezvous::ParsedKey& key, void LocalRendezvous::RecvAsync(const Rendezvous::ParsedKey& key, const Rendezvous::Args& recv_args, Rendezvous::DoneCallback done) { - uint64 key_hash = KeyHash(key.FullKey()); + uint64_t key_hash = KeyHash(key.FullKey()); DVLOG(2) << "Recv " << this << " " << key_hash << " " << key.FullKey(); tsl::core::RefCountPtr rc_keep_alive; diff --git a/tensorflow/core/framework/local_rendezvous.h b/tensorflow/core/framework/local_rendezvous.h index 332daaa6c02060..628bd4642f4762 100644 --- a/tensorflow/core/framework/local_rendezvous.h +++ b/tensorflow/core/framework/local_rendezvous.h @@ -82,7 +82,7 @@ class LocalRendezvous { Item* tail = nullptr; }; - typedef gtl::FlatMap<uint64, ItemQueue> Table; + typedef gtl::FlatMap<uint64_t, ItemQueue> Table; const int num_buckets_; // Pointer to the owner class of this LocalRendezvous if it is refcounted, diff --git a/tensorflow/core/framework/log_memory.cc b/tensorflow/core/framework/log_memory.cc index b168957ef7ed03..4fc2b86e18f156 100644 --- a/tensorflow/core/framework/log_memory.cc +++ b/tensorflow/core/framework/log_memory.cc @@ -19,7 +19,7 @@ limitations under the License. namespace tensorflow { -const string LogMemory::kLogMemoryLabel = "__LOG_MEMORY__"; +const std::string LogMemory::kLogMemoryLabel = "__LOG_MEMORY__"; bool LogMemory::IsEnabled() { return VLOG_IS_ON(2); } @@ -28,23 +28,23 @@ namespace { // Write the proto entry to LOG(INFO).
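// Illustrative note (assumption based on the helper below): OutputToLog()
// trims the proto package prefix, so recording a MemoryLogStep emits a line
// like
//
//   __LOG_MEMORY__ MemoryLogStep { step_id: 1 handle: "test" }
//
// i.e. the type name "tensorflow.MemoryLogStep" is shortened to
// "MemoryLogStep" before the entry is written to LOG(INFO).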
template <typename T> void OutputToLog(const T& proto) { - string type_name(proto.GetTypeName()); + std::string type_name(proto.GetTypeName()); const size_t index = type_name.find_last_of('.'); - if (index != string::npos) type_name = type_name.substr(index + 1); + if (index != std::string::npos) type_name = type_name.substr(index + 1); LOG(INFO) << LogMemory::kLogMemoryLabel << " " << type_name << " { " << proto.ShortDebugString() << " }"; } } // namespace -void LogMemory::RecordStep(const int64_t step_id, const string& handle) { +void LogMemory::RecordStep(const int64_t step_id, const std::string& handle) { MemoryLogStep step; step.set_step_id(step_id); step.set_handle(handle); OutputToLog(step); } -void LogMemory::RecordTensorAllocation(const string& kernel_name, +void LogMemory::RecordTensorAllocation(const std::string& kernel_name, const int64_t step_id, const Tensor& tensor) { MemoryLogTensorAllocation allocation; @@ -55,14 +55,14 @@ void LogMemory::RecordTensorAllocation(const string& kernel_name, } void LogMemory::RecordTensorDeallocation(const int64_t allocation_id, - const string& allocator_name) { + const std::string& allocator_name) { MemoryLogTensorDeallocation deallocation; deallocation.set_allocation_id(allocation_id); deallocation.set_allocator_name(allocator_name); OutputToLog(deallocation); } -void LogMemory::RecordTensorOutput(const string& kernel_name, +void LogMemory::RecordTensorOutput(const std::string& kernel_name, const int64_t step_id, const int index, const Tensor& tensor) { MemoryLogTensorOutput output; @@ -73,7 +73,7 @@ void LogMemory::RecordTensorOutput(const string& kernel_name, OutputToLog(output); } -void LogMemory::RecordRawAllocation(const string& operation, +void LogMemory::RecordRawAllocation(const std::string& operation, const int64_t step_id, size_t num_bytes, void* ptr, Allocator* allocator) { MemoryLogRawAllocation allocation; @@ -86,7 +86,7 @@ void LogMemory::RecordRawAllocation(const string& operation, OutputToLog(allocation); } -void LogMemory::RecordRawDeallocation(const string& operation, +void LogMemory::RecordRawDeallocation(const std::string& operation, const int64_t step_id, void* ptr, Allocator* allocator, bool deferred) { MemoryLogRawDeallocation deallocation; diff --git a/tensorflow/core/framework/logging.cc b/tensorflow/core/framework/logging.cc index 14f23b06d0e5e3..d10b4d555fd00f 100644 --- a/tensorflow/core/framework/logging.cc +++ b/tensorflow/core/framework/logging.cc @@ -36,13 +36,13 @@ bool RegisterListener(void (*listener)(const char*)) { return true; } -bool LogToListeners(string msg, string end) { +bool LogToListeners(std::string msg, std::string end) { auto listeners = logging::GetListeners(); if (listeners->empty()) { return false; } - string ended_msg = absl::StrCat(msg, end); + std::string ended_msg = absl::StrCat(msg, end); for (auto& listener : *listeners) { listener(ended_msg.c_str()); diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h index 06524726e5cfc1..ccc167ca91474e 100644 --- a/tensorflow/core/framework/lookup_interface.h +++ b/tensorflow/core/framework/lookup_interface.h @@ -133,7 +133,7 @@ class LookupInterface : public ResourceBase { absl::Status CheckFindArguments(const Tensor& keys, const Tensor& default_value); - string DebugString() const override { + std::string DebugString() const override { return absl::StrCat("A lookup table of size: ", size()); } diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc index
8b187beb125740..11317fa9656c1f 100644 --- a/tensorflow/core/framework/memory_types.cc +++ b/tensorflow/core/framework/memory_types.cc @@ -43,7 +43,7 @@ int GetTotal(const NameRangeMap& name_map) { // to DEVICE_MEMORY except those args in host_memory_args. Removes // elements of host_memory_args that were used. void MemoryTypesHelper(const NameRangeMap& name_map, - std::vector<string>* host_memory_args, + std::vector<std::string>* host_memory_args, MemoryTypeVector* memory_types) { // Update args that have been marked as in "HOST_MEMORY". size_t keep = 0; @@ -62,10 +62,10 @@ void MemoryTypesHelper(const NameRangeMap& name_map, host_memory_args->resize(keep); } -bool IsFunctionCallOp(const string& op_type) { +bool IsFunctionCallOp(const std::string& op_type) { return op_type == "SymbolicGradient" || op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall" || op_type == "While" || - op_type == "StatelessWhile" || op_type == "If" || + op_type == "StatelessWhile" || op_type == "If" || op_type == "StatelessIf"; } @@ -110,11 +110,11 @@ absl::Status MemoryTypesForNode(const OpRegistryInterface* op_registry, bool is_fn = IsFunctionCallOp(ndef.op()); bool has_kernel_def = status.ok() && !is_fn; auto host_memory_required = [&](const DataType& dt) { - bool int32_on_device = + bool int32_on_device = has_kernel_def || device_type.type_string() == "TPU" || has_xla_compile; return DataTypeAlwaysOnHost(dt) || (dt == DT_INT32 && !int32_on_device); }; - + // Edge cases: // 1. If[Tcond=DT_BOOL, Tin=[DT_FLOAT,DT_INT32], Tout=[DT_FLOAT,DT_INT32]] // * Tcond marked HostMemory by kernel_def @@ -146,17 +146,16 @@ absl::Status MemoryTypesForNode(const OpRegistryInterface* op_registry, out_mtypes->resize(GetTotal(out_names), DEVICE_MEMORY); } - // Fills in host memory types based on the kernel def - if(kdef != nullptr) { // can this ever be false? - const auto& from_proto = kdef->host_memory_arg(); - std::vector<string> host_memory_args(from_proto.begin(), from_proto.end()); - MemoryTypesHelper(inp_names, &host_memory_args, inp_mtypes); - MemoryTypesHelper(out_names, &host_memory_args, out_mtypes); - if (!host_memory_args.empty()) { - return errors::InvalidArgument( - "HostMemory args '", absl::StrJoin(host_memory_args, "', '"), - "' not found in OpDef: ", SummarizeOpDef(*op_def)); - } + // Fills in host memory types based on the kernel def.
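// Illustrative sketch (assumed example, not part of this patch; MyRange and
// MyRangeOp are hypothetical): a kernel registration can pin arguments to
// host memory, e.g.
//
//   REGISTER_KERNEL_BUILDER(Name("MyRange")
//                               .Device(DEVICE_GPU)
//                               .HostMemory("start")
//                               .HostMemory("limit"),
//                           MyRangeOp);
//
// after which the matching below reports HOST_MEMORY for "start" and "limit"
// and leaves the remaining args as DEVICE_MEMORY.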
+ const auto& from_proto = kdef->host_memory_arg(); + std::vector<std::string> host_memory_args(from_proto.begin(), + from_proto.end()); + MemoryTypesHelper(inp_names, &host_memory_args, inp_mtypes); + MemoryTypesHelper(out_names, &host_memory_args, out_mtypes); + if (!host_memory_args.empty()) { + return errors::InvalidArgument( + "HostMemory args '", absl::StrJoin(host_memory_args, "', '"), + "' not found in OpDef: ", SummarizeOpDef(*op_def)); } } else { inp_mtypes->resize(inp_dtypes.size(), DEVICE_MEMORY); @@ -177,7 +176,7 @@ absl::Status MemoryTypesForNode(const OpRegistryInterface* op_registry, } } - std::vector<int32> hostmem_attr; + std::vector<int32_t> hostmem_attr; if (TryGetNodeAttr(ndef, "_input_hostmem", &hostmem_attr)) { for (int32_t i : hostmem_attr) { if (0 <= i && i < inp_mtypes->size()) { diff --git a/tensorflow/core/framework/metrics.cc b/tensorflow/core/framework/metrics.cc index dafcef280b48e5..c55d7e46a89140 100644 --- a/tensorflow/core/framework/metrics.cc +++ b/tensorflow/core/framework/metrics.cc @@ -305,7 +305,7 @@ auto* tf_data_pipeline_processing_time = tsl::monitoring::Gauge::New( "in microseconds", "id"); -auto* tf_data_auto_shard = tsl::monitoring::Gauge<int64, 2>::New( +auto* tf_data_auto_shard = tsl::monitoring::Gauge<int64_t, 2>::New( "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id", "name"); @@ -490,39 +490,41 @@ std::string GraphOptimizationSourceMapping(GraphOptimizationSource source) { } } -void RecordTFDataFetchOp(const string& name) { +void RecordTFDataFetchOp(const std::string& name) { tf_data_fetch_op_counter->GetCell(name)->IncrementBy(1); } -void RecordTFDataAutotune(const string& name) { +void RecordTFDataAutotune(const std::string& name) { tf_data_autotune_counter->GetCell(name)->IncrementBy(1); } tsl::monitoring::CounterCell* GetTFDataBytesConsumedCounter( - const string& name) { + const std::string& name) { return tf_data_bytes_consumed_counter->GetCell(name); } tsl::monitoring::CounterCell* GetTFDataBytesProducedCounter( - const string& name) { + const std::string& name) { return tf_data_bytes_produced_counter->GetCell(name); } -tsl::monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) { +tsl::monitoring::CounterCell* GetTFDataBytesReadCounter( + const std::string& name) { return tf_data_bytes_read_counter->GetCell(name); } -tsl::monitoring::CounterCell* GetTFDataElementsCounter(const string& name) { +tsl::monitoring::CounterCell* GetTFDataElementsCounter( + const std::string& name) { return tf_data_elements_counter->GetCell(name); } tsl::monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge( - const string& id) { + const std::string& id) { return tf_data_model_gauge->GetCell(id); } tsl::monitoring::GaugeCell<int64_t>* GetTFDataPipelineProcessingTimeGauge( - const string& id) { + const std::string& id) { return tf_data_pipeline_processing_time->GetCell(id); } @@ -530,23 +532,23 @@ void RecordTFDataBytesFetched(int64_t num_bytes) { tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes); } -void RecordTFDataExperiment(const string& name) { +void RecordTFDataExperiment(const std::string& name) { tf_data_experiment_counter->GetCell(name)->IncrementBy(1); } -void RecordTFDataExperimentLive(const string& name) { +void RecordTFDataExperimentLive(const std::string& name) { tf_data_experiment_live_counter->GetCell(name)->IncrementBy(1); } -void RecordTFDataExperimentOptIn(const string& name) { +void RecordTFDataExperimentOptIn(const std::string& name) { tf_data_experiment_opt_in_counter->GetCell(name)->IncrementBy(1); } -void RecordTFDataExperimentOptOut(const string& name)
diff --git a/tensorflow/core/framework/metrics.cc b/tensorflow/core/framework/metrics.cc
index dafcef280b48e5..c55d7e46a89140 100644
--- a/tensorflow/core/framework/metrics.cc
+++ b/tensorflow/core/framework/metrics.cc
@@ -305,7 +305,7 @@ auto* tf_data_pipeline_processing_time = tsl::monitoring::Gauge<double, 1>::New(
     "in microseconds", "id");
 
-auto* tf_data_auto_shard = tsl::monitoring::Gauge<int64, 2>::New(
+auto* tf_data_auto_shard = tsl::monitoring::Gauge<int64_t, 2>::New(
     "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
     "name");
 
@@ -490,39 +490,41 @@ std::string GraphOptimizationSourceMapping(GraphOptimizationSource source) {
   }
 }
 
-void RecordTFDataFetchOp(const string& name) {
+void RecordTFDataFetchOp(const std::string& name) {
   tf_data_fetch_op_counter->GetCell(name)->IncrementBy(1);
 }
 
-void RecordTFDataAutotune(const string& name) {
+void RecordTFDataAutotune(const std::string& name) {
   tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
 }
 
 tsl::monitoring::CounterCell* GetTFDataBytesConsumedCounter(
-    const string& name) {
+    const std::string& name) {
   return tf_data_bytes_consumed_counter->GetCell(name);
 }
 
 tsl::monitoring::CounterCell* GetTFDataBytesProducedCounter(
-    const string& name) {
+    const std::string& name) {
   return tf_data_bytes_produced_counter->GetCell(name);
 }
 
-tsl::monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
+tsl::monitoring::CounterCell* GetTFDataBytesReadCounter(
+    const std::string& name) {
   return tf_data_bytes_read_counter->GetCell(name);
 }
 
-tsl::monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
+tsl::monitoring::CounterCell* GetTFDataElementsCounter(
+    const std::string& name) {
   return tf_data_elements_counter->GetCell(name);
 }
 
 tsl::monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
-    const string& id) {
+    const std::string& id) {
   return tf_data_model_gauge->GetCell(id);
 }
 
 tsl::monitoring::GaugeCell<double>* GetTFDataPipelineProcessingTimeGauge(
-    const string& id) {
+    const std::string& id) {
   return tf_data_pipeline_processing_time->GetCell(id);
 }
 
@@ -530,23 +532,23 @@ void RecordTFDataBytesFetched(int64_t num_bytes) {
   tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
 }
 
-void RecordTFDataExperiment(const string& name) {
+void RecordTFDataExperiment(const std::string& name) {
   tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
 }
 
-void RecordTFDataExperimentLive(const string& name) {
+void RecordTFDataExperimentLive(const std::string& name) {
   tf_data_experiment_live_counter->GetCell(name)->IncrementBy(1);
 }
 
-void RecordTFDataExperimentOptIn(const string& name) {
+void RecordTFDataExperimentOptIn(const std::string& name) {
   tf_data_experiment_opt_in_counter->GetCell(name)->IncrementBy(1);
 }
 
-void RecordTFDataExperimentOptOut(const string& name) {
+void RecordTFDataExperimentOptOut(const std::string& name) {
   tf_data_experiment_opt_out_counter->GetCell(name)->IncrementBy(1);
 }
 
-void RecordTFDataFingerprint(const string& name) {
+void RecordTFDataFingerprint(const std::string& name) {
   tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
 }
 
@@ -557,18 +559,18 @@ void RecordTFDataServiceRuntimeCompressionDecision(bool compression_disabled) {
       ->IncrementBy(1);
 }
 
-void RecordTFDataServiceCompressionAction(const string& action) {
+void RecordTFDataServiceCompressionAction(const std::string& action) {
   tf_data_service_compression->GetCell(action)->IncrementBy(1);
 }
 
-void RecordTFDataServiceGetElementDuration(const string& data_transfer_protocol,
-                                           uint64 duration_us) {
+void RecordTFDataServiceGetElementDuration(
+    const std::string& data_transfer_protocol, uint64_t duration_us) {
   tf_data_service_get_element_duration_usecs_histogram
       ->GetCell(data_transfer_protocol)
       ->Add(duration_us);
 }
 
-void RecordTFDataGetNextDuration(uint64 duration_us) {
+void RecordTFDataGetNextDuration(uint64_t duration_us) {
   static auto* tf_data_get_next_duration_cell =
       tf_data_get_next_duration_usecs_histogram->GetCell();
   tf_data_get_next_duration_cell->Add(duration_us);
@@ -586,25 +588,25 @@ void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio) {
   tf_data_buffered_vs_budget_ratio_histogram_cell->Add(ratio);
 }
 
-void RecordTFDataIteratorBusy(uint64 duration_us) {
+void RecordTFDataIteratorBusy(uint64_t duration_us) {
   static auto* tf_data_iterator_busy_cell =
       tf_data_iterator_busy_counter->GetCell();
   tf_data_iterator_busy_cell->IncrementBy(duration_us);
 }
 
-void RecordTFDataIteratorLifetime(uint64 duration_us) {
+void RecordTFDataIteratorLifetime(uint64_t duration_us) {
   static auto* tf_data_iterator_lifetime_cell =
      tf_data_iterator_lifetime_counter->GetCell();
   tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
 }
 
-void RecordTFDataIteratorGap(uint64 duration_us) {
+void RecordTFDataIteratorGap(uint64_t duration_us) {
   static auto* tf_data_iterator_gap_msec_histogram_cell =
      tf_data_iterator_gap_msec_histogram->GetCell();
   tf_data_iterator_gap_msec_histogram_cell->Add(duration_us * 0.001);
 }
 
-void RecordTFDataOptimization(const string& name, int64_t num_changes) {
+void RecordTFDataOptimization(const std::string& name, int64_t num_changes) {
   tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
 }
 
@@ -641,7 +643,7 @@ void RecordTFDataServiceClientIterators(
 }
 
 void RecordTFDataServiceDataTransferProtocolUsed(
-    const string& data_transfer_protocol, bool user_specified) {
+    const std::string& data_transfer_protocol, bool user_specified) {
   std::string nature = user_specified ? "specified" : "default";
   tf_data_service_data_transfer_protocol_used_by_nature
       ->GetCell(data_transfer_protocol, nature)
@@ -649,16 +651,16 @@ void RecordTFDataServiceDataTransferProtocolUsed(
 }
 
 void RecordTFDataServiceDataTransferProtocolFallback(
-    const string& data_transfer_protocol, error::Code code,
-    const string& error_message) {
+    const std::string& data_transfer_protocol, error::Code code,
+    const std::string& error_message) {
   tf_data_service_data_transfer_protocol_fallback
       ->GetCell(data_transfer_protocol, error::Code_Name(code), error_message)
      ->IncrementBy(1);
 }
 
 void RecordTFDataServiceDataTransferProtocolError(
-    const string& data_transfer_protocol, error::Code code,
-    const string& error_message) {
+    const std::string& data_transfer_protocol, error::Code code,
+    const std::string& error_message) {
   tf_data_service_data_transfer_protocol_error
       ->GetCell(data_transfer_protocol, error::Code_Name(code), error_message)
      ->IncrementBy(1);
@@ -688,7 +690,8 @@ void RecordTFDataServiceOptimalNumberOfWorkers(int64_t number_of_workers) {
   tf_data_service_optimal_number_of_workers->GetCell()->Set(number_of_workers);
 }
 
-void RecordTFDataFilename(const string& name, const string& filename) {
+void RecordTFDataFilename(const std::string& name,
+                          const std::string& filename) {
   tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
 }
 
@@ -697,7 +700,7 @@ void RecordTFDataFileLoggerAttempts() {
 }
 
 void RecordTFDataFileLoggerErrors(error::Code error_code,
-                                  const string& error_message) {
+                                  const std::string& error_message) {
   tf_data_file_logger_errors_counter
       ->GetCell(error::Code_Name(error_code), error_message)
      ->IncrementBy(1);
@@ -710,39 +713,40 @@ void RecordTFDataFileLoggerAttemptedNumFiles(size_t num_files) {
 
 void RecordTFDataFileLoggerErrorsNumFiles(size_t num_files,
                                           error::Code error_code,
-                                          const string& error_message) {
+                                          const std::string& error_message) {
   tf_data_file_logger_errors_num_files_counter
       ->GetCell(error::Code_Name(error_code), error_message)
      ->IncrementBy(num_files);
 }
 
-void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
-                           int64 num_workers, int64 num_replicas) {
+void RecordTFDataAutoShard(const std::string& id, data::AutoShardPolicy policy,
+                           int64_t num_workers, int64_t num_replicas) {
   tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64_t>(policy));
   tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
   tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
 }
 
 void RecordTFDataAutoShardRewriteBatchSize(
-    bool eligible, const std::vector<string>& ineligible_reason) {
+    bool eligible, const std::vector<std::string>& ineligible_reason) {
   tf_data_auto_shard_rewrite_batch_size_eligible
       ->GetCell(eligible ? "true" : "false")
      ->IncrementBy(1);
-  for (const string& reason : ineligible_reason) {
+  for (const std::string& reason : ineligible_reason) {
    tf_data_auto_shard_rewrite_batch_size_reason->GetCell(reason)->IncrementBy(
        1);
  }
 }
 
-void RecordTFDataAutotuneStoppingCriteria(const string& name) {
+void RecordTFDataAutotuneStoppingCriteria(const std::string& name) {
   tf_data_autotune_stopping_criteria_counter->GetCell(name)->IncrementBy(1);
 }
 
-void RecordTFDataDebug(const string& event) {
+void RecordTFDataDebug(const std::string& event) {
   tf_data_debug->GetCell(event)->IncrementBy(1);
 }
 
-void RecordTFDataError(const string& error_type, const string& status_code) {
+void RecordTFDataError(const std::string& error_type,
+                       const std::string& status_code) {
   tf_data_error->GetCell(error_type, status_code)->IncrementBy(1);
 }
 
@@ -750,7 +754,7 @@ void RecordTFDataFrameworkType(const std::string& framework_type) {
   tf_data_framework_type->GetCell(framework_type)->IncrementBy(1);
 }
 
-void RecordParseDenseFeature(int64 num_features) {
+void RecordParseDenseFeature(int64_t num_features) {
   static auto* parse_dense_feature_counter_cell =
       parse_dense_feature_counter->GetCell();
   parse_dense_feature_counter_cell->IncrementBy(num_features);
@@ -797,7 +801,7 @@ void UpdateAotBefMlirLoadCount() {
   aot_bef_mlir_load_count_cell->IncrementBy(1);
 }
 
-void UpdateGraphExecTime(const uint64 running_time_usecs) {
+void UpdateGraphExecTime(const uint64_t running_time_usecs) {
   if (running_time_usecs > 0) {
     static auto* graph_runs_cell = graph_runs->GetCell();
     static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
@@ -809,13 +813,13 @@ void UpdateGraphExecTime(const uint64 running_time_usecs) {
   }
 }
 
-void UpdateGraphPendingQueueLength(uint64 len) {
+void UpdateGraphPendingQueueLength(uint64_t len) {
   static auto* graph_pending_queue_length_cell =
       graph_pending_queue_length_histogram->GetCell();
   graph_pending_queue_length_cell->Add(len);
 }
 
-void UpdateGraphBuildTime(const uint64 running_time_usecs) {
+void UpdateGraphBuildTime(const uint64_t running_time_usecs) {
   if (running_time_usecs > 0) {
     static auto* build_graph_calls_cell = build_graph_calls->GetCell();
     static auto* build_graph_time_usecs_cell =
@@ -825,7 +829,7 @@ void UpdateGraphBuildTime(const uint64 running_time_usecs) {
   }
 }
 
-void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs) {
+void UpdateFunctionGraphOptimizationTime(const uint64_t running_time_usecs) {
   if (running_time_usecs > 0) {
     static auto* function_graph_optimization_time_usecs_cell =
         function_graph_optimization_time_usecs->GetCell();
@@ -834,7 +838,7 @@ void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs) {
   }
 }
 
-void UpdateFunctionGraphOptimizationSavingTime(const uint64 saving_time_usecs,
+void UpdateFunctionGraphOptimizationSavingTime(const uint64_t saving_time_usecs,
                                                GraphOptimizationSource source) {
   if (saving_time_usecs > 0) {
     std::string mapped_source = GraphOptimizationSourceMapping(source);
@@ -845,7 +849,7 @@ void UpdateFunctionGraphOptimizationSavingTime(const uint64 saving_time_usecs,
   }
 }
 
-uint64 GetFunctionGraphOptimizationSavingTimeUsecs(
+uint64_t GetFunctionGraphOptimizationSavingTimeUsecs(
     GraphOptimizationSource source) {
   std::string mapped_source = GraphOptimizationSourceMapping(source);
   return graph_optimization_saving_time_usecs->GetCell(mapped_source)->value();
@@ -904,14 +908,14 @@ int64_t GetFunctionGraphOptimizationCacheLoadCount(
   return graph_optimization_cache_load_count->GetCell(mapped_source)->value();
 }
 
-void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
+void UpdateTpuVariableDistributionTime(const uint64_t distribution_time_usecs) {
   if (distribution_time_usecs > 0) {
    tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
        distribution_time_usecs);
  }
 }
 
-void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
+void UpdateXlaCompilationTime(const uint64_t compilation_time_usecs) {
   if (compilation_time_usecs > 0) {
     static auto* xla_compilations_cell = xla_compilations->GetCell();
     static auto* xla_compilation_time_usecs_cell =
@@ -921,32 +925,32 @@ void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
   }
 }
 
-void RecordUnusedOutput(const string& op_name) {
+void RecordUnusedOutput(const std::string& op_name) {
   graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
 }
 
-void RecordPipelineProcessingTime(const string& id,
+void RecordPipelineProcessingTime(const std::string& id,
                                   double pipeline_processing_time_usec) {
   GetTFDataPipelineProcessingTimeGauge(id)->Set(pipeline_processing_time_usec);
 }
 
-void IncrementTestCounter(const string& name, const string& label) {
+void IncrementTestCounter(const std::string& name, const std::string& label) {
   test_counters->GetCell(name, label)->IncrementBy(1);
 }
 
-const tsl::monitoring::CounterCell* TestCounter(const string& name,
-                                                const string& label) {
+const tsl::monitoring::CounterCell* TestCounter(const std::string& name,
+                                                const std::string& label) {
   return test_counters->GetCell(name, label);
 }
 
-TestDelta::TestDelta(const string& name, const string& label)
+TestDelta::TestDelta(const std::string& name, const std::string& label)
     : cell_(TestCounter(name, label)) {
   Reset();
 }
 
 void TestDelta::Reset() { last_value_ = cell_->value(); }
 
-int64 TestDelta::Get() { return cell_->value() - last_value_; }
+int64_t TestDelta::Get() { return cell_->value() - last_value_; }
 
 void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& bridge_type,
                                          const std::string& bridge_version,
@@ -1020,12 +1024,13 @@ void IncrementPhase2XlaCompilerCounter(Phase2XlaCompilerMetric metric) {
      ->IncrementBy(1);
 }
 
-void UpdateTpuErrorCounter(const string& op, const string& error_type) {
+void UpdateTpuErrorCounter(const std::string& op,
+                           const std::string& error_type) {
   tpu_op_error_counter->GetCell(op, error_type)->IncrementBy(1);
 }
 
-void UpdateEagerClientErrorCounter(const string& error_source,
-                                   const string& error_type) {
+void UpdateEagerClientErrorCounter(const std::string& error_source,
+                                   const std::string& error_type) {
   eager_client_error_counter->GetCell(error_source, error_type)->IncrementBy(1);
 }
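Every RecordTFData* helper changed above follows the same pattern: resolve a labeled cell on a process-wide counter or gauge, then bump it. A hedged sketch of that shape with a plain map standing in for tsl::monitoring (the real Counter is thread-safe and registered with the metrics collector; FakeCounter here is purely illustrative):

#include <cstdint>
#include <map>
#include <string>

// Illustrative stand-in for a one-label monitoring counter.
class FakeCounter {
 public:
  // Returns the cell for `label`, creating it on first use.
  int64_t* GetCell(const std::string& label) { return &cells_[label]; }

 private:
  std::map<std::string, int64_t> cells_;
};

// Shaped like RecordTFDataFetchOp(name) above: one increment per call.
void RecordFetchOp(FakeCounter& counter, const std::string& name) {
  *counter.GetCell(name) += 1;
}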
"Batch" or "Map"). -monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name); +monitoring::CounterCell* GetTFDataBytesConsumedCounter(const std::string& name); // Returns a counter that can be used to record the number of bytes produced by // a tf.data.Dataset. // // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). -monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name); +monitoring::CounterCell* GetTFDataBytesProducedCounter(const std::string& name); // Returns a counter than can be used to record the number of bytes read from // the filesystem by a tf.data.Dataset source. @@ -64,43 +64,43 @@ monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name); // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). // // TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter? -monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name); +monitoring::CounterCell* GetTFDataBytesReadCounter(const std::string& name); // Returns a counter than can be used to record the number of elements produced // by a tf.data.Dataset. // // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). -monitoring::CounterCell* GetTFDataElementsCounter(const string& name); +monitoring::CounterCell* GetTFDataElementsCounter(const std::string& name); // Returns a gauge than can be used to record the performance model information. // // The `id` argument represents the (unique) model ID. monitoring::GaugeCell>* GetTFDataModelGauge( - const string& id); + const std::string& id); // Records the number of bytes fetched from tf.data.Dataset iterator. void RecordTFDataBytesFetched(int64_t num_bytes); // Records the number of times a tf.data experiment was applied. -void RecordTFDataExperiment(const string& name); +void RecordTFDataExperiment(const std::string& name); // Records the number of times a tf.data experiment could have been applied. -void RecordTFDataExperimentLive(const string& name); +void RecordTFDataExperimentLive(const std::string& name); // Records the number of times a tf.data experiment was opted into. -void RecordTFDataExperimentOptIn(const string& experiment_name); +void RecordTFDataExperimentOptIn(const std::string& experiment_name); // Records the number of times a tf.data experiment was opted out of. -void RecordTFDataExperimentOptOut(const string& experiment_name); +void RecordTFDataExperimentOptOut(const std::string& experiment_name); // Records the time (in microseconds) spent generating an element and // transferring it over the network for the given protocol. -void RecordTFDataServiceGetElementDuration(const string& data_transfer_protocol, - uint64 duration_us); +void RecordTFDataServiceGetElementDuration( + const std::string& data_transfer_protocol, uint64_t duration_us); // Records the time (in microseconds) spent in a single invocation of // `ItertatorResource::GetNext()`. -void RecordTFDataGetNextDuration(uint64 duration_us); +void RecordTFDataGetNextDuration(uint64_t duration_us); // Records the histogram of ratios of tf.data autotune algorithm used RAM over // the ram budget. @@ -115,7 +115,7 @@ void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio); // // The `name` argument identifies the Dataset graph fingerprint, // created using GraphHash(). -void RecordTFDataFingerprint(const string& name); +void RecordTFDataFingerprint(const std::string& name); // Records the event of a tf.data service pipeline getting a runtime // compression decision. 
@@ -123,26 +123,26 @@ void RecordTFDataServiceRuntimeCompressionDecision(bool compression_decision); // Records the event of a tf.data service pipeline making the compression // related action. -void RecordTFDataServiceCompressionAction(const string& action); +void RecordTFDataServiceCompressionAction(const std::string& action); // Records the time (in microseconds) during which `IteratorResource` was busy // processing at least one `GetNext()` request. -void RecordTFDataIteratorBusy(uint64 duration_us); +void RecordTFDataIteratorBusy(uint64_t duration_us); // Records the time (in microseconds) between `IteratorResource` receiving the // first `GetNext()` request and responding to the last `GetNext()` request. -void RecordTFDataIteratorLifetime(uint64 duration_us); +void RecordTFDataIteratorLifetime(uint64_t duration_us); // Records the time histogram (in microseconds) between `IteratorResource` // responding to a `GetNext()` request and receiving the next `GetNext()` // request. -void RecordTFDataIteratorGap(uint64 duration_us); +void RecordTFDataIteratorGap(uint64_t duration_us); // Records the number of independent graph changes resulting from the // application of a tf.data optimization. // // The `name` argument identifies the optimization (e.g. "noop_elimination"). -void RecordTFDataOptimization(const string& name, int64_t num_changes); +void RecordTFDataOptimization(const std::string& name, int64_t num_changes); // Records that a tf.data service worker has been created. void RecordTFDataServiceWorkerCreated(); @@ -160,21 +160,21 @@ void RecordTFDataServiceClientIterators( // `data_transfer_protocol` to get data from the worker server and whether or // not the user explicitly specified the protocol. void RecordTFDataServiceDataTransferProtocolUsed( - const string& data_transfer_protocol, bool user_specified); + const std::string& data_transfer_protocol, bool user_specified); // Records that a tf.data service worker client fell back to gRPC rather than // use `data_transfer_protocol` because of an error of type `code` with message // `error_message`. void RecordTFDataServiceDataTransferProtocolFallback( - const string& data_transfer_protocol, error::Code code, - const string& error_message); + const std::string& data_transfer_protocol, error::Code code, + const std::string& error_message); // Records that a tf.data service worker client got an error of non-retriable // type `code` with message `error_message` when trying to transfer data over // `data_transfer_protocol`. void RecordTFDataServiceDataTransferProtocolError( - const string& data_transfer_protocol, error::Code code, - const string& error_message); + const std::string& data_transfer_protocol, error::Code code, + const std::string& error_message); // Records tf.data service cross-trainer cache queries. void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit); @@ -195,7 +195,7 @@ void RecordTFDataServiceOptimalNumberOfWorkers(int64_t number_of_workers); // Records the file name read by a tf.data Dataset. // // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). -void RecordTFDataFilename(const string& name, const string& filename); +void RecordTFDataFilename(const std::string& name, const std::string& filename); // Records the total attempts made by file logger. void RecordTFDataFileLoggerAttempts(); @@ -203,7 +203,7 @@ void RecordTFDataFileLoggerAttempts(); // Records an error of type `code` with message `error_message` encountered by // file logger. 
void RecordTFDataFileLoggerErrors(error::Code code, - const string& error_message); + const std::string& error_message); // Records the total number of files attempted to be logged by file logger. void RecordTFDataFileLoggerAttemptedNumFiles(size_t num_files); @@ -212,15 +212,15 @@ void RecordTFDataFileLoggerAttemptedNumFiles(size_t num_files); // `code` with message `error_message` during logging by file logger with this // error code. void RecordTFDataFileLoggerErrorsNumFiles(size_t num_files, error::Code code, - const string& error_message); + const std::string& error_message); // Records statistics of tf.data auto sharding. // // The `id` is a unique identifier of the input pipeline. The `policy` // identifies the auto-sharding policy used, the `num_workers` identifies the // number of workers, and `num_replicas` identifies the number of replicas. -void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, - int64 num_workers, int64 num_replicas); +void RecordTFDataAutoShard(const std::string& id, data::AutoShardPolicy policy, + int64_t num_workers, int64_t num_replicas); // Records statistics of whether we can rewrite batch size in tf.data auto // sharding. @@ -229,26 +229,27 @@ void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, // indicates whether the input pipeline is eligible for the rewrite. The // `ineligible_reason` is the reason if the input pipeline is ineligible. void RecordTFDataAutoShardRewriteBatchSize( - bool eligible, const std::vector& ineligible_reason); + bool eligible, const std::vector& ineligible_reason); // Records the number of times each tf.data autotuning algorithm stopping // criterion is met. -void RecordTFDataAutotuneStoppingCriteria(const string& name); +void RecordTFDataAutotuneStoppingCriteria(const std::string& name); // Records the number of times this event occured, for debugging. -void RecordTFDataDebug(const string& event); +void RecordTFDataDebug(const std::string& event); // Records the number of times an error of this type occurred with this status // code. -void RecordTFDataError(const string& error_type, const string& error_code); +void RecordTFDataError(const std::string& error_type, + const std::string& error_code); // Records the framework type used to build the tf.data.Dataset. void RecordTFDataFrameworkType(const std::string& framework_type); // Records the number of times tf.data file logger encountered an error of this // type occurred with this status code. -void RecordTFDataFileLoggerError(const string& error_type, - const string& error_code); +void RecordTFDataFileLoggerError(const std::string& error_type, + const std::string& error_code); // Records parsing of dense tensor features. void RecordParseDenseFeature(int64_t num_features); @@ -266,14 +267,14 @@ void RecordGraphOutputTensors(const size_t size); // Records the number of cores requested by graphs with XLA SPMD enabled. void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica); -void UpdateGraphExecTime(const uint64 running_time_usecs); -void UpdateGraphPendingQueueLength(uint64 len); +void UpdateGraphExecTime(const uint64_t running_time_usecs); +void UpdateGraphPendingQueueLength(uint64_t len); // Records that one output of an op of type `op_name` was unused. 
-void RecordUnusedOutput(const string& op_name); +void RecordUnusedOutput(const std::string& op_name); // Records the pipeline processing time in microseconds -void RecordPipelineProcessingTime(const string& id, +void RecordPipelineProcessingTime(const std::string& id, double pipeline_processing_time_usec); // Increments the count of binaries loaded from the persistent cache. @@ -295,17 +296,17 @@ void UpdateAotBefMlirLoadCount(); // When executing eagerly, this will not record any activity. // // TODO(jtkeeling): Should we record building/optimizing tf.functions? -void UpdateGraphBuildTime(const uint64 running_time_usecs); +void UpdateGraphBuildTime(const uint64_t running_time_usecs); // Updates the metric stored for time spent optimizing function graphs. -void UpdateFunctionGraphOptimizationTime(const uint64 running_time_usecs); +void UpdateFunctionGraphOptimizationTime(const uint64_t running_time_usecs); // Updates the metric stored for time saved by caching graph optimization. -void UpdateFunctionGraphOptimizationSavingTime(uint64 saving_time_usec, +void UpdateFunctionGraphOptimizationSavingTime(uint64_t saving_time_usec, GraphOptimizationSource source); // Retrieves the total time saved by the graph optimization caching. -uint64 GetFunctionGraphOptimizationSavingTimeUsecs( +uint64_t GetFunctionGraphOptimizationSavingTimeUsecs( GraphOptimizationSource source); // Increments the hit count for the graph optimization cache. @@ -463,10 +464,10 @@ class ScopedCounter final { // Returns duration of the current interval in case the timer has started. // Returns nullopt otherwise. - std::optional DurationMicroSec() const { - return started_ ? std::optional(accumulated_time_ + - Env::Default()->NowMicros() - - start_time_) + std::optional DurationMicroSec() const { + return started_ ? std::optional(accumulated_time_ + + Env::Default()->NowMicros() - + start_time_) : std::nullopt; } @@ -492,7 +493,7 @@ class ScopedCounter final { private: template void ReportInternal(std::index_sequence) { - uint64 time_interval = Env::Default()->NowMicros() - start_time_; + uint64_t time_interval = Env::Default()->NowMicros() - start_time_; time_interval += accumulated_time_; if (time_interval > 0) { counter_->GetCell(labels_[S]...)->IncrementBy(time_interval); @@ -508,8 +509,8 @@ class ScopedCounter final { monitoring::Counter* counter_; std::array labels_; bool started_{false}; - uint64 start_time_; - uint64 accumulated_time_; + uint64_t start_time_; + uint64_t accumulated_time_; }; // Returns a counter used to capture timing metrics for graph optimization @@ -517,32 +518,33 @@ class ScopedCounter final { monitoring::Counter<2>* GetGraphOptimizationCounter(); // Updates metrics for time to distribute variables to all TPU hosts. -void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs); +void UpdateTpuVariableDistributionTime(const uint64_t distribution_time_usecs); // Updates the metrics stored about time XLA spents compiling graphs. -void UpdateXlaCompilationTime(const uint64 compilation_time_usecs); +void UpdateXlaCompilationTime(const uint64_t compilation_time_usecs); // Increments (by 1) a simple integer counter that is exposed for testing. -void IncrementTestCounter(const string& name, const string& label); +void IncrementTestCounter(const std::string& name, const std::string& label); // Read-only access to a counter for testing. 
-const monitoring::CounterCell* TestCounter(const string& name, - const string& label); +const monitoring::CounterCell* TestCounter(const std::string& name, + const std::string& label); // Read-only wrapper for a TestCounter to track increments between calls. class TestDelta { public: - TestDelta(const string& name, const string& label); + TestDelta(const std::string& name, const std::string& label); void Reset(); - int64 Get(); + int64_t Get(); private: const monitoring::CounterCell* cell_; - int64 last_value_; + int64_t last_value_; }; -void UpdateTpuErrorCounter(const string& op, const string& error_type); -void UpdateEagerClientErrorCounter(const string& error_source, - const string& error_type); +void UpdateTpuErrorCounter(const std::string& op, + const std::string& error_type); +void UpdateEagerClientErrorCounter(const std::string& error_source, + const std::string& error_type); } // namespace metrics } // namespace tensorflow diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index 654b15b9ac0201..0d05c8d72b69d7 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -240,7 +240,7 @@ bool AreAllParametersMax(const Model::ModelParameters& parameters) { } // Records the ram usage of hill climbing algorithm. -void RecordAutotuneRamUsage(int64 ram_budget, double max_buffered_bytes) { +void RecordAutotuneRamUsage(int64_t ram_budget, double max_buffered_bytes) { if (ram_budget == 0) { return; } @@ -1227,8 +1227,8 @@ class UnknownRatio : public Node { // The processing time is the sum of the self processing time and the product // of the ratio estimate and the sum of processing times of inputs. void TotalProcessingTimeLocked( - absl::flat_hash_map* processing_times, - absl::flat_hash_map* total_processing_times) override + absl::flat_hash_map* processing_times, + absl::flat_hash_map* total_processing_times) override TF_SHARED_LOCKS_REQUIRED(mu_) { double self_processing_time = SelfProcessingTimeLocked(); if (processing_times) { @@ -1400,13 +1400,13 @@ class AsyncUnknownRatio : public AsyncRatio { thread_local int64_t Node::work_start_; -std::shared_ptr MakeParameter(const string& name, +std::shared_ptr MakeParameter(const std::string& name, std::shared_ptr state, double min, double max) { return std::make_shared(name, state, min, max); } -std::shared_ptr MakeParameter(const string& name, +std::shared_ptr MakeParameter(const std::string& name, std::shared_ptr state, double min, double max, double value) { std::shared_ptr parameter = @@ -1415,7 +1415,7 @@ std::shared_ptr MakeParameter(const string& name, return parameter; } -std::shared_ptr MakeNonTunableParameter(const string& name, +std::shared_ptr MakeNonTunableParameter(const std::string& name, double value) { return std::make_shared(name, nullptr, /*min=*/value, /*max=*/value); @@ -1649,8 +1649,8 @@ Node::ModelParameters Node::CollectNodeTunableParameters() const { return parameters; } -string Node::DebugString() const { - absl::flat_hash_map debug_strings; +std::string Node::DebugString() const { + absl::flat_hash_map debug_strings; tf_shared_lock l(mu_); // Build up the debug string from the leaves of the nodes tree to the root. 
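`ScopedCounter` above is an RAII timer: it samples `Env::Default()->NowMicros()` on start and reports the accumulated interval to a labeled counter cell when the scope ends. A self-contained sketch of the same idea using std::chrono (the real class also supports label arrays and pausing via `accumulated_time_`, which this omits):

#include <chrono>
#include <cstdint>

// Adds the scope's elapsed microseconds to *sink on destruction.
class ScopedTimerSketch {
 public:
  explicit ScopedTimerSketch(uint64_t* sink)
      : sink_(sink), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimerSketch() {
    const auto elapsed = std::chrono::steady_clock::now() - start_;
    *sink_ +=
        std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
  }

 private:
  uint64_t* sink_;
  std::chrono::steady_clock::time_point start_;
};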
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 654b15b9ac0201..0d05c8d72b69d7 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -240,7 +240,7 @@ bool AreAllParametersMax(const Model::ModelParameters& parameters) {
 }
 
 // Records the ram usage of hill climbing algorithm.
-void RecordAutotuneRamUsage(int64 ram_budget, double max_buffered_bytes) {
+void RecordAutotuneRamUsage(int64_t ram_budget, double max_buffered_bytes) {
   if (ram_budget == 0) {
     return;
   }
@@ -1227,8 +1227,8 @@ class UnknownRatio : public Node {
   // The processing time is the sum of the self processing time and the product
   // of the ratio estimate and the sum of processing times of inputs.
   void TotalProcessingTimeLocked(
-      absl::flat_hash_map<string, double>* processing_times,
-      absl::flat_hash_map<string, double>* total_processing_times) override
+      absl::flat_hash_map<std::string, double>* processing_times,
+      absl::flat_hash_map<std::string, double>* total_processing_times) override
       TF_SHARED_LOCKS_REQUIRED(mu_) {
     double self_processing_time = SelfProcessingTimeLocked();
     if (processing_times) {
@@ -1400,13 +1400,13 @@ class AsyncUnknownRatio : public AsyncRatio {
 
 thread_local int64_t Node::work_start_;
 
-std::shared_ptr<Parameter> MakeParameter(const string& name,
+std::shared_ptr<Parameter> MakeParameter(const std::string& name,
                                          std::shared_ptr<SharedState> state,
                                          double min, double max) {
   return std::make_shared<Parameter>(name, state, min, max);
 }
 
-std::shared_ptr<Parameter> MakeParameter(const string& name,
+std::shared_ptr<Parameter> MakeParameter(const std::string& name,
                                          std::shared_ptr<SharedState> state,
                                          double min, double max, double value) {
   std::shared_ptr<Parameter> parameter =
@@ -1415,7 +1415,7 @@ std::shared_ptr<Parameter> MakeParameter(const string& name,
   return parameter;
 }
 
-std::shared_ptr<Parameter> MakeNonTunableParameter(const string& name,
+std::shared_ptr<Parameter> MakeNonTunableParameter(const std::string& name,
                                                    double value) {
   return std::make_shared<Parameter>(name, nullptr, /*min=*/value,
                                      /*max=*/value);
@@ -1649,8 +1649,8 @@ Node::ModelParameters Node::CollectNodeTunableParameters() const {
   return parameters;
 }
 
-string Node::DebugString() const {
-  absl::flat_hash_map<string, string> debug_strings;
+std::string Node::DebugString() const {
+  absl::flat_hash_map<std::string, std::string> debug_strings;
   tf_shared_lock l(mu_);
   // Build up the debug string from the leaves of the nodes tree to the root.
   for (const auto& node :
@@ -2035,9 +2035,10 @@ void Node::CollectTunableParametersHelper(
   }
 }
 
-void Node::DebugStringHelper(absl::flat_hash_map<string, string>* debug_strings)
-    const TF_SHARED_LOCKS_REQUIRED(mu_) {
-  string result;
+void Node::DebugStringHelper(
+    absl::flat_hash_map<std::string, std::string>* debug_strings) const
+    TF_SHARED_LOCKS_REQUIRED(mu_) {
+  std::string result;
   absl::StrAppend(&result, long_name(), ":\n");
   absl::StrAppend(&result, "  autotune=", autotune_.load(), "\n");
   absl::StrAppend(&result, "  buffered_bytes=", buffered_bytes_.load(), "\n");
@@ -2047,7 +2048,7 @@ void Node::DebugStringHelper(absl::flat_hash_map<string, string>* debug_strings)
   absl::StrAppend(&result, "  bytes_produced=", bytes_produced_.load(), "\n");
   absl::StrAppend(&result, "  processing_time=", processing_time_.load(), "\n");
   absl::StrAppend(&result, "  num_elements=", num_elements_.load(), "\n");
-  string inputs;
+  std::string inputs;
   for (auto& input : inputs_) {
     absl::StrAppend(&inputs, input->long_name(), ",");
   }
@@ -2080,7 +2081,7 @@ std::shared_ptr<Node> Node::SnapshotHelper(
   {
     mutex_lock l2(cloned_current->mu_);
     cloned_current->parameters_ =
-        absl::flat_hash_map<string, std::shared_ptr<Parameter>>();
+        absl::flat_hash_map<std::string, std::shared_ptr<Parameter>>();
     for (const auto& [parameter_name, parameter_ptr] : parameters_) {
       cloned_current->parameters_[parameter_name] =
          std::make_shared<Parameter>(parameter_ptr);
@@ -2257,7 +2258,7 @@ Model::Model(std::optional<std::string> dataset_name)
     : dataset_name_(std::move(dataset_name)),
       optimization_period_ms_(kOptimizationPeriodMinMs),
       safe_to_collect_metrics_(std::make_shared<GuardedBool>(true)) {
-  model_id_ = absl::StrCat(reinterpret_cast<uint64>(this));
+  model_id_ = absl::StrCat(reinterpret_cast<uint64_t>(this));
   model_gauge_cell_ = metrics::GetTFDataModelGauge(model_id_);
 
   // Capture `safe_to_collect_metrics_` by value to avoid use-after-free issues
@@ -2297,7 +2298,7 @@ Model::~Model() {
   metrics::RecordPipelineProcessingTime(model_id_, 0);
 }
 
-void Model::AddNode(Node::Factory factory, const string& name,
+void Model::AddNode(Node::Factory factory, const std::string& name,
                     std::shared_ptr<Node> parent,
                     std::shared_ptr<Node>* out_node) {
   // The name captures the sequence of iterators joined by `::`. We only use the
@@ -2935,7 +2936,7 @@ void Model::OptimizeStageBasedNonAsyncInterleaveManyNodes(
                              node_tunable_parameters.end());
   }
   // Initialize the parallelism parameter values to minimal before tuning.
-  for (std::pair<string, std::shared_ptr<Parameter>>& pair :
+  for (std::pair<std::string, std::shared_ptr<Parameter>>& pair :
        tunable_parameters) {
     if (pair.second->name != kParallelism) {
       continue;
@@ -3206,7 +3207,8 @@ absl::Status Model::FromProto(ModelProto model_proto,
   return absl::OkStatus();
 }
 
-absl::Status Model::Save(const string& fname, std::shared_ptr<Node> snapshot,
+absl::Status Model::Save(const std::string& fname,
+                         std::shared_ptr<Node> snapshot,
                          const OptimizationParams& optimization_params) {
   ModelProto model_proto;
   std::unique_ptr<Model> model_snapshot = std::make_unique<Model>();
@@ -3222,7 +3224,8 @@ absl::Status Model::Save(const string& fname, std::shared_ptr<Node> snapshot,
   return WriteBinaryProto(Env::Default(), fname, model_proto);
 }
 
-absl::Status Model::Load(const string& fname, std::unique_ptr<Model>* model,
+absl::Status Model::Load(const std::string& fname,
+                         std::unique_ptr<Model>* model,
                          OptimizationParams* optimization_params) {
   ModelProto model_proto;
   TF_RETURN_IF_ERROR(
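One behavioral detail in the `SnapshotHelper` hunk above: the clone rebuilds `parameters_` with a fresh `Parameter` per entry rather than copying the map of shared_ptrs, so later mutations of the live node do not leak into the snapshot. A toy illustration of that deep-copy-versus-alias distinction (Param is a stand-in, not the model.h type):

#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Param {
  double value;
};

int main() {
  std::map<std::string, std::shared_ptr<Param>> live{
      {"parallelism", std::make_shared<Param>(Param{4.0})}};

  // Deep copy, as in SnapshotHelper: every entry gets its own object.
  std::map<std::string, std::shared_ptr<Param>> snapshot;
  for (const auto& [name, p] : live) {
    snapshot[name] = std::make_shared<Param>(*p);
  }

  live["parallelism"]->value = 8.0;                     // Mutate the live map.
  std::cout << snapshot["parallelism"]->value << "\n";  // Still prints 4.
}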
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index fd47c91842721c..c8c39768dc2e6a 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -97,8 +97,8 @@ struct SharedState {
 
 // Represents a parameter.
 struct Parameter {
-  Parameter(const string& name, std::shared_ptr<SharedState> state, double min,
-            double max)
+  Parameter(const std::string& name, std::shared_ptr<SharedState> state,
+            double min, double max)
       : name(name),
         // Sometimes non-autotune nodes (with `autotune_=false`) may contain
         // parameters (for example inputs of parallel interleave dataset which
@@ -121,7 +121,7 @@ struct Parameter {
         state(parameter->state) {}
 
   // Human-readable name of the parameter.
-  const string name;
+  const std::string name;
 
   // Identifies the model value of the parameter. This can be different from
   // the actual value (e.g. during optimization search).
@@ -138,18 +138,18 @@ struct Parameter {
 };
 
 // Returns a new tunable parameter with the value set to `min`.
-std::shared_ptr<Parameter> MakeParameter(const string& name,
+std::shared_ptr<Parameter> MakeParameter(const std::string& name,
                                          std::shared_ptr<SharedState> state,
                                          double min, double max);
 
 // Returns a new tunable parameter with the value set to `value` instead
 // of `min`.
-std::shared_ptr<Parameter> MakeParameter(const string& name,
+std::shared_ptr<Parameter> MakeParameter(const std::string& name,
                                          std::shared_ptr<SharedState> state,
                                          double min, double max, double value);
 
 // Returns a new non-tunable parameter.
-std::shared_ptr<Parameter> MakeNonTunableParameter(const string& name,
+std::shared_ptr<Parameter> MakeNonTunableParameter(const std::string& name,
                                                    double value);
 
 // Class for managing the ram budget of an iterator. This is necessary for
@@ -283,7 +283,7 @@ class Node {
   // Arguments for `Node` constructor.
   struct Args {
     int64_t id;
-    string name;
+    std::string name;
    std::shared_ptr<Node> output;
  };
 
@@ -292,10 +292,10 @@ class Node {
  using NodePairList =
      std::list<std::pair<std::shared_ptr<Node>, std::shared_ptr<Node>>>;
  using ModelParameters =
-      std::vector<std::pair<string, std::shared_ptr<Parameter>>>;
-  using NodeValues = absl::flat_hash_map<string, double>;
+      std::vector<std::pair<std::string, std::shared_ptr<Parameter>>>;
+  using NodeValues = absl::flat_hash_map<std::string, double>;
  using ParameterGradients =
-      absl::flat_hash_map<std::pair<string, string>, double>;
+      absl::flat_hash_map<std::pair<std::string, std::string>, double>;
 
  explicit Node(Args args)
      : id_(args.id),
@@ -413,10 +413,12 @@ class Node {
  }
 
  // Returns a longer node name that is guaranteed to be unique.
-  string long_name() const { return absl::StrCat(name_, "(id:", id_, ")"); }
+  std::string long_name() const {
+    return absl::StrCat(name_, "(id:", id_, ")");
+  }
 
  // Returns the node name.
-  const string& name() const { return name_; }
+  const std::string& name() const { return name_; }
 
  // Returns the number of elements produced by the node.
  int64_t num_elements() const TF_LOCKS_EXCLUDED(mu_) { return num_elements_; }
@@ -426,7 +428,7 @@ class Node {
  std::shared_ptr<Node> output_shared() { return output_weak_ptr_.lock(); }
 
  // Returns the parameter value.
-  double parameter_value(const string& name) const TF_LOCKS_EXCLUDED(mu_) {
+  double parameter_value(const std::string& name) const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    return parameters_.at(name)->state->value;
  }
@@ -564,7 +566,7 @@ class Node {
  ModelParameters CollectNodeTunableParameters() const TF_LOCKS_EXCLUDED(mu_);
 
  // Returns a human-readable representation of this node.
-  string DebugString() const TF_LOCKS_EXCLUDED(mu_);
+  std::string DebugString() const TF_LOCKS_EXCLUDED(mu_);
 
  // Flushes the metrics recorded by this node.
  void FlushMetrics() TF_LOCKS_EXCLUDED(mu_);
@@ -645,7 +647,7 @@ class Node {
  // Used for (incrementally) recording metrics. The class is thread-safe.
  class Metrics {
   public:
-    explicit Metrics(const string& name)
+    explicit Metrics(const std::string& name)
        : bytes_consumed_counter_(metrics::GetTFDataBytesConsumedCounter(name)),
          bytes_produced_counter_(metrics::GetTFDataBytesProducedCounter(name)),
          num_elements_counter_(metrics::GetTFDataElementsCounter(name)),
@@ -787,8 +789,9 @@ class Node {
      TF_SHARED_LOCKS_REQUIRED(mu_);
 
  // Build up debug string for the node and store in the debug strings map.
-  void DebugStringHelper(absl::flat_hash_map<string, string>* debug_strings)
-      const TF_SHARED_LOCKS_REQUIRED(mu_);
+  void DebugStringHelper(
+      absl::flat_hash_map<std::string, std::string>* debug_strings) const
+      TF_SHARED_LOCKS_REQUIRED(mu_);
 
  // Copy the node and add the (input, copy) pairs to the NodePairList.
  std::shared_ptr<Node> SnapshotHelper(std::shared_ptr<Node> cloned_output,
                                       NodePairList* node_pairs) const;
@@ -827,7 +830,7 @@ class Node {
  mutable mutex mu_;
  const int64_t id_;
-  const string name_;
+  const std::string name_;
 
  // Indicates whether the subtree rooted in this node should be included in
  // autotuning. In particular, if this is `false`, then the subtree is excluded
@@ -844,7 +847,7 @@ class Node {
  std::atomic<int64_t> processing_time_;
  std::atomic<bool> record_metrics_;
  Metrics metrics_;
-  absl::flat_hash_map<string, std::shared_ptr<Parameter>> parameters_
+  absl::flat_hash_map<std::string, std::shared_ptr<Parameter>> parameters_
      TF_GUARDED_BY(mu_);
 
  // Statistic of inputs processing time history.
@@ -952,7 +955,7 @@ class Model {
  }
 
  // Adds a node with the given name and given parent.
-  void AddNode(Node::Factory factory, const string& name,
+  void AddNode(Node::Factory factory, const std::string& name,
               std::shared_ptr<Node> parent, std::shared_ptr<Node>* out_node)
      TF_LOCKS_EXCLUDED(mu_);
@@ -1014,12 +1017,13 @@ class Model {
 
  // Saves this model with a given snapshot and its optimization parameters to a
  // file. Note that the file directory must already exist.
-  absl::Status Save(const string& fname, std::shared_ptr<Node> snapshot,
+  absl::Status Save(const std::string& fname, std::shared_ptr<Node> snapshot,
                    const OptimizationParams& optimization_params);
 
  // Loads a model and its optimization parameters from a file with the given
  // name.
-  static absl::Status Load(const string& fname, std::unique_ptr<Model>* model,
+  static absl::Status Load(const std::string& fname,
+                           std::unique_ptr<Model>* model,
                           OptimizationParams* optimization_params);
 
  // Records gap time between consecutive `GetNext()` calls.
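The reflowed `Save`/`Load` signatures above keep the usage shape that the test below exercises. Roughly, under the assumption of a full TensorFlow build with `model`, `snapshot`, and `optimization_params` already populated (a hedged sketch, not the test's exact code):

// Save() serializes a binary ModelProto; the static Load() reads it back.
std::string fname;
CHECK(Env::Default()->LocalTempFilename(&fname));
TF_CHECK_OK(model->Save(fname, snapshot, optimization_params));

std::unique_ptr<model::Model> restored;
OptimizationParams restored_params;
TF_CHECK_OK(model::Model::Load(fname, &restored, &restored_params));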
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index b7d42eaa0522d3..6ad728f1a0de2c 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -54,7 +54,7 @@ std::function<int64_t(int64_t)> RamBudgetFunc(int64_t budget) {
   return [budget](int64_t) { return budget; };
 }
 
-int64_t CountParametersOnNode(const string& node_name,
+int64_t CountParametersOnNode(const std::string& node_name,
                               const Model::ModelParameters& parameters) {
   int64_t cnt = 0;
   for (const auto& pair : parameters) {
@@ -865,10 +865,11 @@ TEST(AsyncInterleaveManyGradientTest, Model) {
               (new_output_time - output_time) / kParameterStep,
               kComparisonPrecision);
 }
 
-class AsyncKnownRatioGradientTest : public ::testing::TestWithParam<string> {};
+class AsyncKnownRatioGradientTest
+    : public ::testing::TestWithParam<std::string> {};
 
 TEST_P(AsyncKnownRatioGradientTest, Model) {
-  const string parameter_name = GetParam();
+  const std::string parameter_name = GetParam();
   const double input_time = 100;
   const int64_t num_inputs_per_output = 2;
@@ -1165,7 +1166,7 @@ TEST(SaveModelTest, Model) {
 
   // Make Save->Load roundtrip.
   Env* env = Env::Default();
-  string tmpFile;
+  std::string tmpFile;
   EXPECT_TRUE(env->LocalTempFilename(&tmpFile));
   tmpFile += "_autotune_model_test";
diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc
index 92fb66395efbf8..fcbb4b7d3672a3 100644
--- a/tensorflow/core/framework/node_def_builder.cc
+++ b/tensorflow/core/framework/node_def_builder.cc
@@ -32,7 +32,7 @@ NodeDefBuilder::NodeOut::NodeOut() {
 }
 
 void NodeDefBuilder::NodeOut::Reset(absl::string_view n, int i, DataType dt) {
-  node = string(n);
+  node = std::string(n);
   index = i;
   data_type = dt;
 }
@@ -41,9 +41,9 @@ NodeDefBuilder::NodeDefBuilder(absl::string_view name,
                                absl::string_view op_name,
                                const OpRegistryInterface* op_registry,
                                const NodeDebugInfo* debug) {
-  node_def_.set_name(string(name));
+  node_def_.set_name(name);
   const absl::Status status =
-      op_registry->LookUpOpDef(string(op_name), &op_def_);
+      op_registry->LookUpOpDef(std::string(op_name), &op_def_);
   if (status.ok()) {
     Initialize();
   } else {
@@ -62,7 +62,7 @@ NodeDefBuilder::NodeDefBuilder(absl::string_view name,
 
 NodeDefBuilder::NodeDefBuilder(absl::string_view name, const OpDef* op_def)
     : op_def_(op_def) {
-  node_def_.set_name(string(name));
+  node_def_.set_name(name);
   Initialize();
 }
 
@@ -182,7 +182,7 @@ void NodeDefBuilder::AddInput(absl::string_view src_node, int src_index) {
   } else if (src_index > 0) {
     node_def_.add_input(absl::StrCat(src_node, ":", src_index));
   } else {
-    node_def_.add_input(string(src_node));
+    node_def_.add_input(std::string(src_node));
   }
 }
 
@@ -210,13 +210,13 @@ NodeDefBuilder& NodeDefBuilder::ControlInput(absl::string_view src_node) {
 }
 
 NodeDefBuilder& NodeDefBuilder::Device(absl::string_view device_spec) {
-  node_def_.set_device(string(device_spec));
+  node_def_.set_device(device_spec);
   return *this;
 }
 
 absl::Status NodeDefBuilder::Finalize(NodeDef* node_def, bool consume) {
-  const std::vector<string>* errors_ptr = &errors_;
-  std::vector<string> errors_storage;
+  const std::vector<std::string>* errors_ptr = &errors_;
+  std::vector<std::string> errors_storage;
   if (op_def_ != nullptr && inputs_specified_ < op_def_->input_arg_size()) {
     // Since this is a const method, to add an error, we have to make
     // a copy of the existing errors.
@@ -318,9 +318,9 @@ ATTR(const TensorProto&)
 ATTR(const NameAttrList&)
 ATTR(absl::Span<const StringPiece>)
 ATTR(absl::Span<const char* const>)
-ATTR(absl::Span<const string>)
+ATTR(absl::Span<const std::string>)
 ATTR(absl::Span<const tstring>)
-ATTR(absl::Span<const int32>)
+ATTR(absl::Span<const int32_t>)
 ATTR(absl::Span<const int64_t>)
 ATTR(absl::Span<const float>)
 ATTR(absl::Span<const bool>)
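For orientation, the builder whose internals are cleaned up above is normally driven as a fluent chain ending in `Finalize()`. A hedged usage sketch, assuming a TensorFlow build and a registered "Add" op:

// Each setter returns *this; errors are deferred and surfaced by Finalize().
NodeDef node_def;
absl::Status status = NodeDefBuilder("my_add", "Add")
                          .Input("x", 0, DT_FLOAT)
                          .Input("y", 0, DT_FLOAT)
                          .Device("/cpu:0")
                          .Finalize(&node_def);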
diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h
index 47b14f185800cf..6b74b20fd85ad3 100644
--- a/tensorflow/core/framework/node_def_builder.h
+++ b/tensorflow/core/framework/node_def_builder.h
@@ -56,7 +56,7 @@ class NodeDefBuilder {
     NodeOut(absl::string_view n, int i, DataType dt);
     NodeOut();  // uninitialized, call Reset() before use.
     void Reset(absl::string_view n, int i, DataType dt);
-    string node;
+    std::string node;
     int index;
     DataType data_type;
   };
@@ -112,9 +112,10 @@ class NodeDefBuilder {
                        absl::Span<const char* const> value);
   NodeDefBuilder& Attr(absl::string_view name, absl::Span<const tstring> value);
-  NodeDefBuilder& Attr(absl::string_view name, absl::Span<const string> value);
+  NodeDefBuilder& Attr(absl::string_view name,
+                       absl::Span<const std::string> value);
   NodeDefBuilder& Attr(absl::string_view name, absl::Span<const int64_t> value);
-  NodeDefBuilder& Attr(absl::string_view name, absl::Span<const int32> value);
+  NodeDefBuilder& Attr(absl::string_view name, absl::Span<const int32_t> value);
   NodeDefBuilder& Attr(absl::string_view name, absl::Span<const float> value);
   NodeDefBuilder& Attr(absl::string_view name, absl::Span<const bool> value);
   NodeDefBuilder& Attr(absl::string_view name, absl::Span<const DataType> value);
@@ -145,7 +146,7 @@ class NodeDefBuilder {
   absl::Status Finalize(NodeDef* node_def, bool consume = false);
 
   // Accessors for the values set in the constructor.
-  const string& node_name() const { return node_def_.name(); }
+  const std::string& node_name() const { return node_def_.name(); }
   const OpDef& op_def() const { return *op_def_; }
 
 private:
@@ -189,8 +190,8 @@ class NodeDefBuilder {
   const OpDef* op_def_;
   NodeDef node_def_;
   int inputs_specified_;
-  std::vector<string> control_inputs_;
-  std::vector<string> errors_;
+  std::vector<std::string> control_inputs_;
+  std::vector<std::string> errors_;
 };
 
}  // namespace tensorflow
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index b5429579bc889b..c769537ab13d94 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -79,12 +79,12 @@ class NodeDefBuilderTest : public ::testing::Test {
   // Calls Finalize() and verifies it returns an error.
   // Each message must appear as a substring of the error.
   void ExpectFailures(NodeDefBuilder& builder,  // NOLINT
-                      const std::vector<string>& messages) {
+                      const std::vector<std::string>& messages) {
     NodeDef node_def;
     absl::Status status = builder.Finalize(&node_def);
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
-    for (const string& message : messages) {
+    for (const std::string& message : messages) {
       EXPECT_TRUE(absl::StrContains(status.message(), message))
          << status << ", " << message;
    }
@@ -93,14 +93,14 @@ class NodeDefBuilderTest : public ::testing::Test {
   // Calls Finalize() and verifies it returns an error.
   // Message must appear as a substring of the error.
   void ExpectFailure(NodeDefBuilder& builder,  // NOLINT
-                     const string& message) {
+                     const std::string& message) {
     ExpectFailures(builder, {message});
   }
 
   // Like ExpectFailure(), except that the error can come from
   // ValidateNodeDef().
   void ExpectInvalid(NodeDefBuilder& builder,  // NOLINT
-                     const string& message) {
+                     const std::string& message) {
     NodeDef node_def;
     absl::Status status = builder.Finalize(&node_def);
     if (status.ok()) {
@@ -822,9 +822,9 @@ TEST_F(NodeDefBuilderTest, AttrManyDefault) {
              .Input(FakeInput(DT_FLOAT))
              .Attr("a", "foo")
              .Attr("e", "foo")
-              .Attr("b", std::vector<string>({"bar", "baz"}))
+              .Attr("b", std::vector<std::string>({"bar", "baz"}))
              .Attr("f", 1.0f),
-          {DT_FLOAT}, {}, R"proto(
+          {DT_FLOAT}, {}, R"pb(
            op: "AttrManyDefaultAndInferred"
            input: "a"
            attr {
@@ -854,7 +854,7 @@ TEST_F(NodeDefBuilderTest, AttrManyDefault) {
            attr {
              key: "d"
              value { f: 0.3 }
-            })proto");
+            })pb");
 }
 
 TEST_F(NodeDefBuilderTest, AttrListDefault) {
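The `R"proto(` → `R"pb(` edits above change only the raw-string delimiter tag (which formatting and proto tooling key off), not the string contents. A minimal standalone illustration of C++ raw-string delimiters:

#include <cassert>
#include <string>

int main() {
  // Identical contents; only the arbitrary delimiter tag differs.
  std::string a = R"proto(key: "value")proto";
  std::string b = R"pb(key: "value")pb";
  assert(a == b);
  return 0;
}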
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 507e2275afc3b5..42c5e841c99417 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -64,18 +64,18 @@ AttrSlice::AttrSlice(const NodeDef& node_def)
 
 AttrSlice::AttrSlice(const AttrValueMap* a) : ndef_(nullptr), attrs_(a) {}
 
-string SummarizeAttrsHelper(AttrSlice attrs, absl::string_view device) {
-  string ret;
+std::string SummarizeAttrsHelper(AttrSlice attrs, absl::string_view device) {
+  std::string ret;
 
   // We sort the attrs so the output is deterministic.
-  std::vector<string> attr_names;
+  std::vector<std::string> attr_names;
   attr_names.reserve(attrs.size());
   for (const auto& attr : attrs) {
     attr_names.push_back(attr.first);
   }
   std::sort(attr_names.begin(), attr_names.end());
   bool first = true;
-  for (const string& attr_name : attr_names) {
+  for (const std::string& attr_name : attr_names) {
     if (!first) absl::StrAppend(&ret, ", ");
     first = false;
     absl::StrAppend(&ret, attr_name, "=",
@@ -91,18 +91,18 @@ string SummarizeAttrsHelper(AttrSlice attrs, absl::string_view device) {
   return ret;
 }
 
-string AttrSlice::SummarizeNode() const {
+std::string AttrSlice::SummarizeNode() const {
   return ndef_ ? SummarizeNodeDef(*ndef_)
               : absl::StrCat(
                     "[", SummarizeAttrsHelper(*this, absl::string_view()), "]");
 }
 
-string AttrSlice::DebugString() const {
-  std::vector<string> attr_key_vals;
+std::string AttrSlice::DebugString() const {
+  std::vector<std::string> attr_key_vals;
   attr_key_vals.reserve(attrs()->size());
   for (const auto& it : *this) {
-    const string& name = it.first;
+    const std::string& name = it.first;
     const AttrValue& attr_value = it.second;
     attr_key_vals.push_back(
        absl::StrCat(name, "=", SummarizeAttrValue(attr_value)));
@@ -110,15 +110,17 @@ string AttrSlice::DebugString() const {
   return absl::StrJoin(attr_key_vals, ", ");
 }
 
-string SummarizeNodeDef(const NodeDef& node_def, int max_inputs_in_summary) {
-  string ret = absl::StrCat(errors::FormatNodeNameForError(node_def.name()),
-                            " = ", node_def.op(), "[");
+std::string SummarizeNodeDef(const NodeDef& node_def,
+                             int max_inputs_in_summary) {
+  std::string ret =
+      absl::StrCat(errors::FormatNodeNameForError(node_def.name()), " = ",
+                   node_def.op(), "[");
   absl::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device()));
   absl::StrAppend(&ret, "](");
 
   // Output inputs, including control inputs, verbatim.
   bool first = true;
-  for (const string& input : node_def.input()) {
+  for (const std::string& input : node_def.input()) {
     if (!first) absl::StrAppend(&ret, ", ");
     first = false;
     if (max_inputs_in_summary-- == 0) {
@@ -131,22 +133,22 @@ string SummarizeNodeDef(const NodeDef& node_def, int max_inputs_in_summary) {
   return ret;
 }
 
-string SummarizeAttrs(const NodeDef& node_def) {
+std::string SummarizeAttrs(const NodeDef& node_def) {
   return SummarizeAttrsHelper(node_def, node_def.device());
 }
 
-string FormatNodeDefForError(
+std::string FormatNodeDefForError(
     absl::string_view node_name, bool has_experimental_debug_info,
     const NodeDef_ExperimentalDebugInfo& experimental_debug_info) {
   return !has_experimental_debug_info ||
                experimental_debug_info.original_node_names().empty()
-             ? errors::FormatNodeNameForError(string(node_name))
+             ? errors::FormatNodeNameForError(node_name)
            : errors::FormatOriginalNodeLocationForError(
                  experimental_debug_info.original_node_names(),
                  experimental_debug_info.original_func_names());
 }
 
-string FormatNodeDefForError(const NodeDef& node_def) {
+std::string FormatNodeDefForError(const NodeDef& node_def) {
   return FormatNodeDefForError(node_def.name(),
                               node_def.has_experimental_debug_info(),
                               node_def.experimental_debug_info());
@@ -174,7 +176,7 @@ const AttrValue* AttrSlice::Find(absl::string_view attr_name) const {
   return nullptr;
 }
 
-const AttrValue* AttrSlice::FindByString(const string& attr_name) const {
+const AttrValue* AttrSlice::FindByString(const std::string& attr_name) const {
   auto iter = attrs()->find(attr_name);
   if (iter != attrs()->end()) {
     return &iter->second;
@@ -205,7 +207,7 @@ absl::Status AttrSlice::Find(absl::string_view attr_name,
   return CheckFind(attr_name, *attr_value);
 }
 
-absl::Status AttrSlice::FindByString(const string& attr_name,
+absl::Status AttrSlice::FindByString(const std::string& attr_name,
                                      const AttrValue** attr_value) const {
   *attr_value = FindByString(attr_name);
   return CheckFind(attr_name, *attr_value);
@@ -288,19 +290,19 @@ bool AttrSlice::EqualAttrs(AttrSlice other, Scratch* scratch) const {
 }
 
 DEFINE_GET_ATTR(tstring, s, "string", emplace_back, v, ;)
 DEFINE_TRY_GET_ATTR(tstring, s, "string", emplace_back, v, ;)
-DEFINE_GET_ATTR(string, s, "string", emplace_back, v, ;)
-DEFINE_TRY_GET_ATTR(string, s, "string", emplace_back, v, ;)
+DEFINE_GET_ATTR(std::string, s, "string", emplace_back, v, ;)
+DEFINE_TRY_GET_ATTR(std::string, s, "string", emplace_back, v, ;)
 DEFINE_GET_ATTR(int64_t, i, "int", emplace_back, v, ;)
 DEFINE_TRY_GET_ATTR(int64_t, i, "int", emplace_back, v, ;)
 DEFINE_GET_ATTR(
-    int32, i, "int", emplace_back, static_cast<int32>(v),
-    if (static_cast<int64_t>(static_cast<int32>(v)) != v) {
+    int32_t, i, "int", emplace_back, static_cast<int32_t>(v),
+    if (static_cast<int64_t>(static_cast<int32_t>(v)) != v) {
      return errors::InvalidArgument("Attr ", attr_name, " has value ", v,
                                     " out of range for an int32");
    })
 DEFINE_TRY_GET_ATTR(
-    int32, i, "int", emplace_back, static_cast<int32>(v),
-    if (static_cast<int64_t>(static_cast<int32>(v)) != v) {
+    int32_t, i, "int", emplace_back, static_cast<int32_t>(v),
+    if (static_cast<int64_t>(static_cast<int32_t>(v)) != v) {
      static int log_counter = 0;
      if (log_counter < 10) {
        log_counter++;
@@ -345,13 +347,13 @@ DEFINE_GET_ATTR(NameAttrList, func, "func", emplace_back, v, ;);
 #undef DEFINE_GET_ATTR
 
 bool HasNodeAttr(const NodeDef& node_def, absl::string_view attr_name) {
-  return node_def.attr().find(string(attr_name)) != node_def.attr().end();
+  return node_def.attr().find(std::string(attr_name)) != node_def.attr().end();
 }
 
-static const string& kEmptyString = *new string();
+static const std::string& kEmptyString = *new std::string();
 
-const string& GetNodeAttrString(const AttrSlice& attrs,
-                                absl::string_view attr_name) {
+const std::string& GetNodeAttrString(const AttrSlice& attrs,
+                                     absl::string_view attr_name) {
   const AttrValue* attr_value = attrs.Find(attr_name);
   if (attr_value == nullptr) {
     return kEmptyString;
@@ -364,7 +366,7 @@ const string& GetNodeAttrString(const AttrSlice& attrs,
 }
 
 bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name,
-                    std::vector<const string*>* value) {
+                    std::vector<const std::string*>* value) {
   const AttrValue* attr_value = attrs.Find(attr_name);
   if (attr_value == nullptr) {
     return false;
@@ -456,7 +458,7 @@ bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name,
 
 absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name,
                          Padding* value) {
-  string str_value;
+  std::string str_value;
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, attr_name, &str_value));
   return GetPaddingFromString(str_value, value);
 }
@@ -473,7 +475,7 @@ absl::Status AddArgToSig(const NodeDefOrAttrSlice& node_or_attrs,
      TF_RETURN_IF_ERROR(
          GetNodeAttr(node_or_attrs, arg_def.number_attr(), &repeats));
      // We can't handle outputs that are larger than int32 sizes.
-      if (static_cast<int64_t>(static_cast<int32>(repeats)) != repeats) {
+      if (static_cast<int64_t>(static_cast<int32_t>(repeats)) != repeats) {
        return errors::InvalidArgument("Number of outputs is too big: ", repeats);
      }
      if (repeats < 0) {
@@ -645,10 +647,10 @@ absl::Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   bool seen_control = false;
   size_t num_inputs = 0;
   // TODO(josh11b): Unify the input field validation.
-  for (const string& input : node_def.input()) {
+  for (const std::string& input : node_def.input()) {
     if (absl::StartsWith(input, "^")) {
       seen_control = true;
-      if (input.find(':') != string::npos) {
+      if (input.find(':') != std::string::npos) {
        return errors::InvalidArgument("Control input '", input,
                                       "' must not have ':' in NodeDef: ",
                                       FormatNodeDefForError(node_def));
@@ -662,7 +664,7 @@ absl::Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
    }
  }
 
-  std::unordered_map<string, const OpDef::AttrDef*> op_attrs;
+  std::unordered_map<std::string, const OpDef::AttrDef*> op_attrs;
   for (const auto& attr : op_def.attr()) {
     if (!gtl::InsertIfNotPresent(&op_attrs, attr.name(), &attr)) {
       return errors::InvalidArgument("OpDef has duplicate attr name '",
@@ -700,7 +702,7 @@ absl::Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   // Were all attrs in the OpDef found in the NodeDef?
   if (!op_attrs.empty()) {
-    string attrs;
+    std::string attrs;
    for (const auto& attr_pair : op_attrs) {
      if (!attrs.empty()) absl::StrAppend(&attrs, "', '");
      absl::StrAppend(&attrs, attr_pair.first);
@@ -870,7 +872,8 @@ const absl::string_view kColocationGroupPrefixStringPiece(
 
}  // namespace
 
-absl::Status ValidateOpInput(const string& input_name, bool* is_control_input) {
+absl::Status ValidateOpInput(const std::string& input_name,
+                             bool* is_control_input) {
   *is_control_input = false;
   if (IsValidDataInputName(input_name)) {
     return absl::OkStatus();
@@ -882,7 +885,7 @@ absl::Status ValidateOpInput(const string& input_name, bool* is_control_input) {
   }
 }
 
-absl::Status ValidateNodeName(const string& node_name) {
+absl::Status ValidateNodeName(const std::string& node_name) {
   if (IsValidNodeName(node_name)) {
     return absl::OkStatus();
   } else {
@@ -896,7 +899,7 @@ absl::Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
     return AttachDef(s, node_def);
   }
   bool in_control_inputs = false;
-  for (const string& input_name : node_def.input()) {
+  for (const std::string& input_name : node_def.input()) {
     bool is_control_input;
     s = ValidateOpInput(input_name, &is_control_input);
     if (!s.ok()) {
@@ -915,7 +918,7 @@ absl::Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
 
 absl::Status AttachDef(const absl::Status& status, const NodeDef& node_def,
                        bool allow_multiple_formatted_node) {
-  string node_error;
+  std::string node_error;
   if (!allow_multiple_formatted_node &&
       absl::StrContains(status.message(), "{{node ")) {
     node_error = node_def.name();
@@ -930,11 +933,11 @@ absl::Status AttachDef(const absl::Status& status, const NodeDef& node_def,
 void AddNodeAttr(absl::string_view name, const AttrValue& value,
                 NodeDef* node_def) {
   node_def->mutable_attr()->insert(
-      AttrValueMap::value_type(string(name), value));
+      AttrValueMap::value_type(std::string(name), value));
 }
 
 void AddNodeAttr(absl::string_view name, AttrValue&& value, NodeDef* node_def) {
-  (*node_def->mutable_attr())[string(name)] = std::move(value);
+  (*node_def->mutable_attr())[std::string(name)] = std::move(value);
 }
 
 #define ADD_NODE_ATTR(T)                                   \
@@ -957,8 +960,8 @@ ADD_NODE_ATTR(const TensorProto&)
 ADD_NODE_ATTR(const NameAttrList&)
 ADD_NODE_ATTR(absl::Span<const StringPiece>)
 ADD_NODE_ATTR(absl::Span<const char* const>)
-ADD_NODE_ATTR(absl::Span<const string>)
-ADD_NODE_ATTR(absl::Span<const int32>)
+ADD_NODE_ATTR(absl::Span<const std::string>)
+ADD_NODE_ATTR(absl::Span<const int32_t>)
 ADD_NODE_ATTR(absl::Span<const int64_t>)
 ADD_NODE_ATTR(absl::Span<const float>)
 ADD_NODE_ATTR(absl::Span<const bool>)
@@ -973,7 +976,7 @@ ADD_NODE_ATTR(absl::Span<const NameAttrList>)
 
 void AddAttr(absl::string_view name, const AttrValue& value,
             AttrValueMap* map) {
-  map->insert(AttrValueMap::value_type(string(name), value));
+  map->insert(AttrValueMap::value_type(std::string(name), value));
 }
 
 #define ADD_ATTR(T)                                        \
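Note the asymmetry the two `AddNodeAttr` overloads above preserve: the copying overload uses `insert`, which keeps an existing attr under the same name, while the move overload assigns through `operator[]`, which overwrites. A standalone illustration of that map-insert distinction (std::map shown; protobuf's Map insert behaves the same way here):

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, int> attrs{{"N", 1}};
  attrs.insert({"N", 2});           // No effect: the key already exists.
  attrs["N"] = 3;                   // operator[] overwrites.
  std::cout << attrs["N"] << "\n";  // Prints 3.
}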
if (uniquify_frame_name && (node_def->op() == "Enter" || node_def->op() == "RefEnter")) { - string frame_name; + std::string frame_name; TF_RETURN_IF_ERROR(GetNodeAttr(*node_def, "frame_name", &frame_name)); AttrValue& attr = (*node_def->mutable_attr())["frame_name"]; frame_name = absl::StrCat(prefix, frame_name, suffix); @@ -1005,7 +1008,7 @@ } absl::Status MaybeAddPrefixToColocationConstraints( - const std::unordered_set<string>& match, absl::string_view prefix, + const std::unordered_set<std::string>& match, absl::string_view prefix, NodeDef* node_def) { auto attr = node_def->mutable_attr()->find(kColocationAttrName); if (attr == node_def->mutable_attr()->end()) { @@ -1016,7 +1019,7 @@ absl::Status MaybeAddPrefixToColocationConstraints( for (size_t i = 0; i < constraints_size; ++i) { absl::string_view original(constraints_list->s(i)); if (absl::ConsumePrefix(&original, kColocationGroupPrefixStringPiece)) { - if (match.find(string(original)) != match.end()) { + if (match.find(std::string(original)) != match.end()) { (*constraints_list->mutable_s(i)) = absl::StrCat(kColocationGroupPrefix, prefix, original); } diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h index 2b82c596fee301..1dd97f9e4137db 100644 --- a/tensorflow/core/framework/node_def_util.h +++ b/tensorflow/core/framework/node_def_util.h @@ -81,7 +81,7 @@ std::string FormatNodeDefForError( absl::string_view node_name, bool has_experimental_debug_info, const NodeDef_ExperimentalDebugInfo& experimental_debug_info); -typedef protobuf::Map<string, AttrValue> AttrValueMap; +typedef protobuf::Map<std::string, AttrValue> AttrValueMap; // Adds an attr with name and value to *node_def. // The type of the attr is based on the type of value. @@ -109,9 +109,9 @@ void AddNodeAttr(absl::string_view name, absl::Span<const StringPiece> value, NodeDef* node_def); void AddNodeAttr(absl::string_view name, absl::Span<const char* const> value, NodeDef* node_def); -void AddNodeAttr(absl::string_view name, absl::Span<const string> value, +void AddNodeAttr(absl::string_view name, absl::Span<const std::string> value, NodeDef* node_def); -void AddNodeAttr(absl::string_view name, absl::Span<const int32> value, +void AddNodeAttr(absl::string_view name, absl::Span<const int32_t> value, NodeDef* node_def); void AddNodeAttr(absl::string_view name, absl::Span<const int64_t> value, NodeDef* node_def); @@ -221,7 +221,7 @@ absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, int64_t* value); // type: "int" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - int32* value); // type: "int" + int32_t* value); // type: "int" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, float* value); // type: "float" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, @@ -236,14 +236,15 @@ absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, PartialTensorShape* value); // type: "shape" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, Tensor* value); // type: "tensor" -absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - std::vector<string>* value); // type "list(string)" +absl::Status GetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector<std::string>* value); // type "list(string)" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, std::vector<tstring>* value); // type "list(tstring)" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view
attr_name, std::vector<int64_t>* value); // type "list(int)" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - std::vector<int32>* value); // type "list(int)" + std::vector<int32_t>* value); // type "list(int)" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, std::vector<float>* value); // type "list(float)" absl::Status GetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, @@ -302,7 +303,7 @@ bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, std::vector<int64_t>* value); // type: "int" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - int32* value); // type: "int" + int32_t* value); // type: "int" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, float* value); // type: "float" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, @@ -313,11 +314,11 @@ bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, TensorShape* value); // type: "shape" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - std::vector<string>* value); // type: "list(string)" + std::vector<std::string>* value); // type: "list(string)" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, std::vector<tstring>* value); // type: "list(tstring)" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - std::vector<int32>* value); // type: "list(int)" + std::vector<int32_t>* value); // type: "list(int)" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, std::vector<float>* value); // type: "list(float)" bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, @@ -329,8 +330,8 @@ bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, // Overloads of TryGetNodeAttr() that avoid copying the non-POD attribute // values. -bool TryGetNodeAttr(const AttrSlice& attrs, absl::string_view attr_name, - std::vector<const string*>* value); // type: "list(string)" +bool TryGetNodeAttr( + const AttrSlice& attrs, absl::string_view attr_name, + std::vector<const std::string*>* value); // type: "list(string)" bool TryGetNodeAttr( const AttrSlice& attrs, absl::string_view attr_name, std::vector<const TensorShapeProto*>* value); // type: "list(shape)" @@ -442,7 +444,7 @@ absl::Status AddPrefixAndSuffixToNode(absl::string_view prefix, // Appends the given prefix to the colocation group name if the name exists // in `to_match`.
absl::Status MaybeAddPrefixToColocationConstraints( - const std::unordered_set<string>& match, absl::string_view prefix, + const std::unordered_set<std::string>& match, absl::string_view prefix, NodeDef* node_def); // Updates the colocation constraint name with the one provided in the map (if diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc index 5296dcc7075dc6..66a37a41ee3f8a 100644 --- a/tensorflow/core/framework/node_def_util_test.cc +++ b/tensorflow/core/framework/node_def_util_test.cc @@ -37,7 +37,7 @@ OpDef ToOpDef(const OpDefBuilder& builder) { return op_reg_data.op_def; } -NodeDef ToNodeDef(const string& text) { +NodeDef ToNodeDef(const std::string& text) { NodeDef node_def; EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &node_def)); return node_def; @@ -56,7 +56,7 @@ void ExpectSuccess(const NodeDef& good, const OpDef& op_def) { } void ExpectFailure(const NodeDef& bad, const OpDef& op_def, - const string& message) { + const std::string& message) { absl::Status status = ValidateNodeDef(bad, op_def); EXPECT_FALSE(status.ok()) << "NodeDef: " << SummarizeNodeDef(bad) @@ -322,7 +322,7 @@ void ExpectValidSyntax(const NodeDef& good) { << "NodeDef: " << SummarizeNodeDef(good); } -void ExpectInvalidSyntax(const NodeDef& bad, const string& message) { +void ExpectInvalidSyntax(const NodeDef& bad, const std::string& message) { absl::Status status = ValidateExternalNodeDefSyntax(bad); ASSERT_FALSE(status.ok()) << "NodeDef: " << SummarizeNodeDef(bad); @@ -761,11 +761,11 @@ TEST(AddPrefixAndSuffixToNode, Enter) { node_def.set_name("enter"); node_def.set_op("Enter"); AddNodeAttr("frame_name", "test_frame", &node_def); - const string prefix = "prefix/"; - const string suffix = "/suffix"; + const std::string prefix = "prefix/"; + const std::string suffix = "/suffix"; TF_ASSERT_OK(AddPrefixAndSuffixToNode(prefix, suffix, &node_def)); EXPECT_EQ("prefix/enter/suffix", node_def.name()); - string frame_name; + std::string frame_name; TF_ASSERT_OK(GetNodeAttr(node_def, "frame_name", &frame_name)); EXPECT_EQ("prefix/test_frame/suffix", frame_name); } @@ -780,15 +780,15 @@ TEST(MaybeAddPrefixToColocationConstraints, Basic) { absl::StrCat(kColocationGroupPrefix, "Node3")}, &node_def); - std::unordered_set<string> match; + std::unordered_set<std::string> match; match.insert("Node1"); match.insert("Node3"); TF_ASSERT_OK(MaybeAddPrefixToColocationConstraints(match, "fn/", &node_def)); - std::vector<string> coloc_constraints; + std::vector<std::string> coloc_constraints; TF_ASSERT_OK(GetNodeAttr(node_def, kColocationAttrName, &coloc_constraints)); - EXPECT_EQ( - coloc_constraints, - std::vector<string>({"loc:@fn/Node1", "loc:@Node2", "loc:@fn/Node3"})); + EXPECT_EQ(coloc_constraints, + std::vector<std::string>( + {"loc:@fn/Node1", "loc:@Node2", "loc:@fn/Node3"})); } TEST(MaybeAddPrefixToColocationConstraints, NoConstraints) { @@ -796,7 +796,7 @@ TEST(MaybeAddPrefixToColocationConstraints, NoConstraints) { node_def.set_name("Identity"); node_def.set_op("Identity"); - std::unordered_set<string> match; + std::unordered_set<std::string> match; match.insert("Node1"); match.insert("Node3"); TF_ASSERT_OK(MaybeAddPrefixToColocationConstraints(match, "fn/", &node_def)); @@ -817,10 +817,10 @@ TEST(MaybeUpdateColocationConstraintsWithMap, Basic) { node_map["Node1"] = "Node4"; node_map["Invalid"] = "Node5"; TF_ASSERT_OK(MaybeUpdateColocationConstraintsWithMap(node_map, &node_def)); - std::vector<string> coloc_constraints; + std::vector<std::string> coloc_constraints; TF_ASSERT_OK(GetNodeAttr(node_def, kColocationAttrName, &coloc_constraints)); - EXPECT_EQ(coloc_constraints, -
std::vector({"loc:@Node4", "loc:@Node2", "loc:@Node3"})); + EXPECT_EQ(coloc_constraints, std::vector( + {"loc:@Node4", "loc:@Node2", "loc:@Node3"})); } TEST(MaybeUpdateColocationConstraintsWithMap, NoConstraints) { diff --git a/tensorflow/core/framework/node_properties_test.cc b/tensorflow/core/framework/node_properties_test.cc index 8e1dd344e91261..28f992c4e4dff4 100644 --- a/tensorflow/core/framework/node_properties_test.cc +++ b/tensorflow/core/framework/node_properties_test.cc @@ -40,7 +40,7 @@ class MockOpRegistry : public OpRegistryInterface { // Returns an error status and sets *op_reg_data to nullptr if no OpDef is // registered under that name, otherwise returns the registered OpDef. // Caller must not delete the returned pointer. - absl::Status LookUp(const string& op_type_name, + absl::Status LookUp(const std::string& op_type_name, const OpRegistrationData** op_reg_data) const override { if (op_type_name == "Foo") { *op_reg_data = &op_reg_; diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc index 31aeb2421bc652..7688578d8513f5 100644 --- a/tensorflow/core/framework/op.cc +++ b/tensorflow/core/framework/op.cc @@ -39,7 +39,7 @@ absl::Status DefaultValidator(const OpRegistryInterface& op_registry) { // OpRegistry ----------------------------------------------------------------- -absl::Status OpRegistryInterface::LookUpOpDef(const string& op_type_name, +absl::Status OpRegistryInterface::LookUpOpDef(const std::string& op_type_name, const OpDef** op_def) const { *op_def = nullptr; const OpRegistrationData* op_reg_data = nullptr; @@ -62,7 +62,7 @@ void OpRegistry::Register(const OpRegistrationDataFactory& op_data_factory) { namespace { // Helper function that returns Status message for failed LookUp. -absl::Status OpNotFound(const string& op_type_name) { +absl::Status OpNotFound(const std::string& op_type_name) { absl::Status status = errors::NotFound( "Op type not registered '", op_type_name, "' in binary running on ", port::Hostname(), ". 
", @@ -76,13 +76,14 @@ absl::Status OpNotFound(const string& op_type_name) { } } // namespace -absl::Status OpRegistry::LookUp(const string& op_type_name, +absl::Status OpRegistry::LookUp(const std::string& op_type_name, const OpRegistrationData** op_reg_data) const { if ((*op_reg_data = LookUp(op_type_name))) return absl::OkStatus(); return OpNotFound(op_type_name); } -const OpRegistrationData* OpRegistry::LookUp(const string& op_type_name) const { +const OpRegistrationData* OpRegistry::LookUp( + const std::string& op_type_name) const { { tf_shared_lock l(mu_); if (initialized_) { @@ -96,7 +97,7 @@ const OpRegistrationData* OpRegistry::LookUp(const string& op_type_name) const { } const OpRegistrationData* OpRegistry::LookUpSlow( - const string& op_type_name) const { + const std::string& op_type_name) const { const OpRegistrationData* res = nullptr; bool first_call = false; @@ -195,10 +196,10 @@ absl::Status OpRegistry::ProcessRegistrations() const { return CallDeferred(); } -string OpRegistry::DebugString(bool include_internal) const { +std::string OpRegistry::DebugString(bool include_internal) const { OpList op_list; Export(include_internal, &op_list); - string ret; + std::string ret; for (const auto& op : op_list.op()) { absl::StrAppend(&ret, SummarizeOpDef(op), "\n"); } @@ -268,7 +269,7 @@ OpListOpRegistry::OpListOpRegistry(const OpList* op_list) { } const OpRegistrationData* OpListOpRegistry::LookUp( - const string& op_type_name) const { + const std::string& op_type_name) const { auto iter = index_.find(op_type_name); if (iter == index_.end()) { return nullptr; @@ -277,7 +278,8 @@ const OpRegistrationData* OpListOpRegistry::LookUp( } absl::Status OpListOpRegistry::LookUp( - const string& op_type_name, const OpRegistrationData** op_reg_data) const { + const std::string& op_type_name, + const OpRegistrationData** op_reg_data) const { if ((*op_reg_data = LookUp(op_type_name))) return absl::OkStatus(); return OpNotFound(op_type_name); } diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h index 41b39fc2076469..251d58bdd01a15 100644 --- a/tensorflow/core/framework/op.h +++ b/tensorflow/core/framework/op.h @@ -165,7 +165,8 @@ class OpRegistry : public OpRegistryInterface { // Functions in deferred_ may only be called with mu_ held. mutable std::vector deferred_ TF_GUARDED_BY(mu_); // Values are owned. - mutable absl::flat_hash_map> + mutable absl::flat_hash_map> registry_ TF_GUARDED_BY(mu_); mutable bool initialized_ TF_GUARDED_BY(mu_); @@ -193,7 +194,8 @@ class OpListOpRegistry : public OpRegistryInterface { private: // Values are owned. - absl::flat_hash_map> index_; + absl::flat_hash_map> + index_; }; // Support for defining the OpDef (specifying the semantics of the Op and how diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc index da11e32498becf..f6087d6d5f33ed 100644 --- a/tensorflow/core/framework/op_compatibility_test.cc +++ b/tensorflow/core/framework/op_compatibility_test.cc @@ -88,10 +88,10 @@ class OpCompatibilityTest : public OpsTestBase { TF_ASSERT_OK(RunOpKernel()); } - string Result() { return GetOutput(0)->scalar()(); } + std::string Result() { return GetOutput(0)->scalar()(); } void ExpectIncompatible(const OpDef& old_op_def, const OpDef& new_op_def, - const string& error) { + const std::string& error) { // Test OpDefCompatible gives the same answer without the node_def. 
absl::Status status = OpDefCompatible(old_op_def, new_op_def); if (status.ok()) { @@ -103,8 +103,9 @@ class OpCompatibilityTest : public OpsTestBase { } } - void ExpectInvalid(const OpDef& old_op_def, const string& validation_error, - const string& compatibility_error) { + void ExpectInvalid(const OpDef& old_op_def, + const std::string& validation_error, + const std::string& compatibility_error) { // Record the original signature before we change *node_def(). DataTypeVector old_in_types, old_out_types; TF_ASSERT_OK(InOutTypesForNode(*node_def(), old_op_def, &old_in_types, @@ -127,7 +128,7 @@ class OpCompatibilityTest : public OpsTestBase { } void ExpectTypeMismatch(const OpDef& old_op_def, - const string& compatibility_error) { + const std::string& compatibility_error) { // Record the original signature before we change *node_def(). DataTypeVector old_in_types, old_out_types; TF_ASSERT_OK(InOutTypesForNode(*node_def(), old_op_def, &old_in_types, @@ -153,7 +154,7 @@ class OpCompatibilityTest : public OpsTestBase { } void ExpectRenameFailure(const OpDef& old_op_def, - const string& compatibility_error) { + const std::string& compatibility_error) { // This should be all that is needed to get compatibility. const OpDef* new_op_def = RegisteredOpDef(); AddDefaultsToNodeDef(*new_op_def, node_def()); @@ -166,7 +167,7 @@ class OpCompatibilityTest : public OpsTestBase { } void ExpectDefaultChangeFailure(const OpDef& old_op_def, - const string& compatibility_error) { + const std::string& compatibility_error) { // This should be all that is needed to get compatibility. const OpDef* new_op_def = RegisteredOpDef(); AddDefaultsToNodeDef(*new_op_def, node_def()); diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc index e4ec9e50497d73..9265f5b10ed7e4 100644 --- a/tensorflow/core/framework/op_def_builder.cc +++ b/tensorflow/core/framework/op_def_builder.cc @@ -36,7 +36,7 @@ namespace tensorflow { namespace { -string AttrError(absl::string_view orig, const string& op_name) { +std::string AttrError(absl::string_view orig, const std::string& op_name) { return absl::StrCat(" from Attr(\"", orig, "\") for Op ", op_name); } @@ -62,7 +62,7 @@ bool ConsumeListPrefix(absl::string_view* sp) { bool ConsumeQuotedString(char quote_ch, absl::string_view* sp, absl::string_view* out) { - const string quote_str(1, quote_ch); + const std::string quote_str(1, quote_ch); return Scanner(*sp) .OneLiteral(quote_str.c_str()) .RestartCapture() @@ -150,7 +150,7 @@ bool ProcessCompoundType(const absl::string_view type_string, } void FinalizeAttr(absl::string_view spec, bool allow_attr_type_any, - OpDef* op_def, std::vector<string>* errors) { + OpDef* op_def, std::vector<std::string>* errors) { OpDef::AttrDef* attr = op_def->add_attr(); absl::string_view orig(spec); @@ -161,7 +161,7 @@ void FinalizeAttr(absl::string_view spec, bool allow_attr_type_any, // Read "" or "list()".
bool is_list = ConsumeListPrefix(&spec); - string type; + std::string type; absl::string_view type_string; // Used if type == "type" if (absl::ConsumePrefix(&spec, "string")) { type = "string"; @@ -197,8 +197,8 @@ void FinalizeAttr(absl::string_view spec, bool allow_attr_type_any, VERIFY(ConsumeQuotedString('"', &spec, &escaped_string) || ConsumeQuotedString('\'', &spec, &escaped_string), "Trouble parsing allowed string at '", spec, "'"); - string unescaped; - string error; + std::string unescaped; + std::string error; VERIFY(absl::CUnescape(escaped_string, &unescaped, &error), "Trouble unescaping \"", escaped_string, "\", got error: ", error); @@ -274,8 +274,8 @@ void FinalizeAttr(absl::string_view spec, bool allow_attr_type_any, #undef VERIFY -string InOutError(bool is_output, absl::string_view orig, - const string& op_name) { +std::string InOutError(bool is_output, absl::string_view orig, + const std::string& op_name) { return strings::StrCat(" from ", is_output ? "Output" : "Input", "(\"", orig, "\") for Op ", op_name); } @@ -343,7 +343,7 @@ bool ConsumeControlOutName(absl::string_view* sp, absl::string_view* out) { } while (false) void FinalizeInputOrOutput(absl::string_view spec, bool is_output, - OpDef* op_def, std::vector<string>* errors) { + OpDef* op_def, std::vector<std::string>* errors) { OpDef::ArgDef* arg = is_output ? op_def->add_output_arg() : op_def->add_input_arg(); @@ -426,12 +426,13 @@ void FinalizeInputOrOutput(absl::string_view spec, bool is_output, #undef VERIFY -string ControlOutError(absl::string_view orig, const string& op_name) { +std::string ControlOutError(absl::string_view orig, + const std::string& op_name) { return absl::StrCat(" from ControlOutput(\"", orig, "\") for Op ", op_name); } void FinalizeControlOutput(absl::string_view name, OpDef* op_def, - std::vector<string>* errors) { + std::vector<std::string>* errors) { absl::string_view orig(name); // Parse control output name. @@ -441,7 +442,7 @@ void FinalizeControlOutput(absl::string_view name, OpDef* op_def, ControlOutError(orig, op_def->name()))); } - *op_def->add_control_output() = string(tmp_name.data(), tmp_name.size()); + *op_def->add_control_output() = std::string(tmp_name.data(), tmp_name.size()); } int num_leading_spaces(absl::string_view s) { @@ -467,12 +468,12 @@ bool IsDocNameColon(absl::string_view s) { return ConsumeDocNameColon(&s, nullptr /* out */); } -void FinalizeDoc(const string& text, OpDef* op_def, - std::vector<string>* errors) { - std::vector<string> lines = str_util::Split(text, '\n'); +void FinalizeDoc(const std::string& text, OpDef* op_def, + std::vector<std::string>* errors) { + std::vector<std::string> lines = str_util::Split(text, '\n'); // Remove trailing spaces. - for (string& line : lines) { + for (std::string& line : lines) { absl::StripTrailingAsciiWhitespace(&line); } @@ -493,8 +494,9 @@ void FinalizeDoc(const string& text, OpDef* op_def, int end_l = l; // Trim trailing blank lines from the description. while (start_l < end_l && lines[end_l - 1].empty()) --end_l; - string desc = absl::StrJoin( - absl::Span<const string>(lines.data() + start_l, end_l - start_l), "\n"); + std::string desc = absl::StrJoin( + absl::Span<const std::string>(lines.data() + start_l, end_l - start_l), + "\n"); if (!desc.empty()) op_def->set_description(desc); // name: description @@ -528,7 +530,7 @@ void FinalizeDoc(const string& text, OpDef* op_def, if (!description[i].empty()) description[i].remove_prefix(min_indent); } // Concatenate lines into a single string. - const string complete(absl::StrJoin(description, "\n")); + const std::string complete(absl::StrJoin(description, "\n")); // Find name.
bool found = false; @@ -561,31 +563,31 @@ } // namespace -OpDefBuilder::OpDefBuilder(string op_name) { +OpDefBuilder::OpDefBuilder(std::string op_name) { op_def()->set_name(std::move(op_name)); } -OpDefBuilder& OpDefBuilder::Attr(string spec) { +OpDefBuilder& OpDefBuilder::Attr(std::string spec) { attrs_.push_back(std::move(spec)); return *this; } -OpDefBuilder& OpDefBuilder::Input(string spec) { +OpDefBuilder& OpDefBuilder::Input(std::string spec) { inputs_.push_back(std::move(spec)); return *this; } -OpDefBuilder& OpDefBuilder::Output(string spec) { +OpDefBuilder& OpDefBuilder::Output(std::string spec) { outputs_.push_back(std::move(spec)); return *this; } -OpDefBuilder& OpDefBuilder::ControlOutput(string name) { +OpDefBuilder& OpDefBuilder::ControlOutput(std::string name) { control_outputs_.push_back(std::move(name)); return *this; } -OpDefBuilder& OpDefBuilder::Doc(string text) { +OpDefBuilder& OpDefBuilder::Doc(std::string text) { #ifndef TF_LEAN_BINARY if (!doc_.empty()) { errors_.push_back( @@ -622,7 +624,7 @@ OpDefBuilder& OpDefBuilder::SetIsDistributedCommunication() { return *this; } -OpDefBuilder& OpDefBuilder::Deprecated(int version, string explanation) { +OpDefBuilder& OpDefBuilder::Deprecated(int version, std::string explanation) { if (op_def()->has_deprecation()) { errors_.push_back( absl::StrCat("Deprecated called twice for Op ", op_def()->name())); @@ -667,7 +669,7 @@ OpDefBuilder& OpDefBuilder::AllowAttrTypeAny() { } absl::Status OpDefBuilder::Finalize(OpRegistrationData* op_reg_data) const { - std::vector<string> errors = errors_; + std::vector<std::string> errors = errors_; *op_reg_data = op_reg_data_; OpDef* op_def = &op_reg_data->op_def; diff --git a/tensorflow/core/framework/op_def_builder.h b/tensorflow/core/framework/op_def_builder.h index 8009135d584188..3df88e028c2bd2 100644 --- a/tensorflow/core/framework/op_def_builder.h +++ b/tensorflow/core/framework/op_def_builder.h @@ -40,7 +40,7 @@ typedef std::vector<std::reference_wrapper<const FullTypeDef>> TypeRefVector; // A callback into the type inference process, allowing type inference functions // to request inferring the type of some function (assumed to exist in the // runtime). The function is specified by name.
-typedef std::function<absl::StatusOr<FullTypeDef>(const string&, +typedef std::function<absl::StatusOr<FullTypeDef>(const std::string&, const TypeRefVector&)> FunctionTypeInferrer; @@ -266,12 +266,12 @@ class OpDefBuilder { OpDef* op_def() { return &op_reg_data_.op_def; } OpRegistrationData op_reg_data_; - std::vector<string> attrs_; - std::vector<string> inputs_; - std::vector<string> outputs_; - std::vector<string> control_outputs_; + std::vector<std::string> attrs_; + std::vector<std::string> inputs_; + std::vector<std::string> outputs_; + std::vector<std::string> control_outputs_; std::string doc_; - std::vector<string> errors_; + std::vector<std::string> errors_; bool allow_attr_type_any_ = false; }; diff --git a/tensorflow/core/framework/op_def_builder_test.cc b/tensorflow/core/framework/op_def_builder_test.cc index 3e8c805bcb419f..8dad7a721dad34 100644 --- a/tensorflow/core/framework/op_def_builder_test.cc +++ b/tensorflow/core/framework/op_def_builder_test.cc @@ -74,7 +74,7 @@ class OpDefBuilderTest : public ::testing::Test { } } - void ExpectFailure(const OpDefBuilder& builder, const string& error) { + void ExpectFailure(const OpDefBuilder& builder, const std::string& error) { OpRegistrationData op_reg_data; absl::Status status = builder.Finalize(&op_reg_data); EXPECT_FALSE(status.ok()); diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc index e228d1f4969a7c..b11360b68bb4a6 100644 --- a/tensorflow/core/framework/op_def_util.cc +++ b/tensorflow/core/framework/op_def_util.cc @@ -48,7 +48,7 @@ absl::Status AllowedTypeValue(DataType dt, const OpDef::AttrDef& attr) { return absl::OkStatus(); } } - string allowed_str; + std::string allowed_str; for (int i = 0; i < allowed_values.list().type_size(); ++i) { if (!allowed_str.empty()) { absl::StrAppend(&allowed_str, ", "); } @@ -61,15 +61,16 @@ absl::Status AllowedTypeValue(DataType dt, const OpDef::AttrDef& attr) { " is not in the list of allowed values: ", allowed_str); } -absl::Status AllowedStringValue(const string& str, const OpDef::AttrDef& attr) { +absl::Status AllowedStringValue(const std::string& str, + const OpDef::AttrDef& attr) { const AttrValue& allowed_values(attr.allowed_values()); for (const auto& allowed : allowed_values.list().s()) { if (str == allowed) { return absl::OkStatus(); } } - string allowed_str; - for (const string& allowed : allowed_values.list().s()) { + std::string allowed_str; + for (const std::string& allowed : allowed_values.list().s()) { if (!allowed_str.empty()) { absl::StrAppend(&allowed_str, ", "); } @@ -135,7 +136,7 @@ absl::Status ValidateAttrValue(const AttrValue& attr_value, } else if (attr.type() == "string") { TF_RETURN_IF_ERROR(AllowedStringValue(attr_value.s(), attr)); } else if (attr.type() == "list(string)") { - for (const string& str : attr_value.list().s()) { + for (const std::string& str : attr_value.list().s()) { TF_RETURN_IF_ERROR(AllowedStringValue(str, attr)); } } else { @@ -193,7 +194,7 @@ const ApiDef::Arg* FindInputArg(absl::string_view name, const ApiDef& api_def) { static absl::Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def, bool output, absl::flat_hash_set<absl::string_view>* names) { - const string suffix = + const std::string suffix = absl::StrCat(output ? " for output '" : " for input '", arg.name(), "'"); VALIDATE(names->emplace(arg.name()).second, "Duplicate name: ", arg.name()); VALIDATE(HasAttrStyleType(arg), "Missing type", suffix); @@ -320,7 +321,7 @@ absl::Status ValidateOpDef(const OpDef& op_def) { // Validate allowed_values if (attr.has_allowed_values()) { - const string list_type = + const std::string list_type = is_list ?
attr.type() : absl::StrCat("list(", attr.type(), ")"); TF_RETURN_WITH_CONTEXT_IF_ERROR( AttrValueHasType(attr.allowed_values(), list_type), " for attr '", @@ -360,7 +361,7 @@ absl::Status CheckOpDeprecation(const OpDef& op_def, int graph_def_version) { } else { // Warn only once for each op name, and do it in a threadsafe manner. static mutex mu(LINKER_INITIALIZED); - static auto* warned = new absl::flat_hash_set<string>(); + static auto* warned = new absl::flat_hash_set<std::string>(); bool warn; { mutex_lock lock(mu); @@ -378,8 +379,9 @@ absl::Status CheckOpDeprecation(const OpDef& op_def, int graph_def_version) { namespace { -string SummarizeArgs(const protobuf::RepeatedPtrField<OpDef::ArgDef>& args) { - string ret; +std::string SummarizeArgs( + const protobuf::RepeatedPtrField<OpDef::ArgDef>& args) { + std::string ret; for (const OpDef::ArgDef& arg : args) { if (!ret.empty()) absl::StrAppend(&ret, ", "); absl::StrAppend(&ret, arg.name(), ":"); @@ -399,8 +401,8 @@ string SummarizeArgs(const protobuf::RepeatedPtrField<OpDef::ArgDef>& args) { } // namespace -string SummarizeOpDef(const OpDef& op_def) { - string ret = absl::StrCat("Op<name=", op_def.name()); +std::string SummarizeOpDef(const OpDef& op_def) { + std::string ret = absl::StrCat("Op<name=", op_def.name()); absl::StrAppend(&ret, "; signature=", SummarizeArgs(op_def.input_arg()), " -> ", SummarizeArgs(op_def.output_arg())); for (int i = 0; i < op_def.attr_size(); ++i) { @@ -474,12 +476,12 @@ bool MoreRestrictive(const OpDef::AttrDef& old_attr, return false; } -string AllowedStr(const OpDef::AttrDef& attr) { +std::string AllowedStr(const OpDef::AttrDef& attr) { if (!attr.has_allowed_values()) return "no restriction"; return SummarizeAttrValue(attr.allowed_values()); } -string DefaultAttrStr(const OpDef::AttrDef& attr) { +std::string DefaultAttrStr(const OpDef::AttrDef& attr) { if (!attr.has_default_value()) return "no default"; return SummarizeAttrValue(attr.default_value()); } @@ -495,7 +497,7 @@ bool HigherMinimum(const OpDef::AttrDef& old_attr, return new_attr.minimum() > old_attr.minimum(); } -string MinStr(const OpDef::AttrDef& attr) { +std::string MinStr(const OpDef::AttrDef& attr) { if (!attr.has_minimum()) return "no minimum"; return absl::StrCat(attr.minimum()); } @@ -509,7 +511,7 @@ void FillAttrMap(const OpDef& op_def, AttrMap* attr_map) { // Add a comma to *s every call but the first (*add_comma should be // initialized to false). -void AddComma(string* s, bool* add_comma) { +void AddComma(std::string* s, bool* add_comma) { if (*add_comma) { absl::StrAppend(s, ", "); } else { @@ -518,7 +520,7 @@ } // Will add the `name` from arg if name is true. -void AddName(string* s, bool name, const OpDef::ArgDef& arg) { +void AddName(std::string* s, bool name, const OpDef::ArgDef& arg) { if (name) { absl::StrAppend(s, arg.name(), ":"); } @@ -535,11 +537,11 @@ void AddName(string* s, bool name, const OpDef::ArgDef& arg) { // // We get the types by either using the attrs in args if they are in // old_attrs, or substituting the default value from new_attrs. -string ComputeArgSignature( +std::string ComputeArgSignature( const protobuf::RepeatedPtrField<OpDef::ArgDef>& args, const AttrMap& old_attrs, const AttrMap& new_attrs, std::vector<bool>* ref, bool names) { - string s; + std::string s; bool add_comma = false; for (const OpDef::ArgDef& arg : args) { if (!arg.type_list_attr().empty()) { @@ -568,7 +570,7 @@ string ComputeArgSignature( } } else { int num = 1; // How many input/outputs does this represent? - string type; // What is the type of this arg? + std::string type; // What is the type of this arg? AddName(&type, names, arg); if (!arg.number_attr().empty()) { // N * type case.
@@ -655,9 +657,9 @@ absl::Status OpDefCompatible(const OpDef& old_op, const OpDef& new_op) { } std::vector<bool> old_in_ref, new_in_ref, old_out_ref, new_out_ref; - const string old_in_sig = ComputeArgSignature( + const std::string old_in_sig = ComputeArgSignature( old_op.input_arg(), old_attrs, new_attrs, &old_in_ref, false /* names */); - const string new_in_sig = ComputeArgSignature( + const std::string new_in_sig = ComputeArgSignature( new_op.input_arg(), old_attrs, new_attrs, &new_in_ref, false /* names */); VALIDATE(old_in_sig == new_in_sig, "Input signature mismatch '", old_in_sig, "' vs. '", new_in_sig, "'"); @@ -669,10 +671,10 @@ absl::Status OpDefCompatible(const OpDef& old_op, const OpDef& new_op) { " changed from non-ref to ref"); } - const string old_out_sig = + const std::string old_out_sig = ComputeArgSignature(old_op.output_arg(), old_attrs, new_attrs, &old_out_ref, true /* names */); - const string new_out_sig = + const std::string new_out_sig = ComputeArgSignature(new_op.output_arg(), old_attrs, new_attrs, &new_out_ref, true /* names */); VALIDATE(old_out_sig == new_out_sig, "Output signature mismatch '", @@ -805,13 +807,13 @@ bool AttrDefEqual(const OpDef::AttrDef& a1, const OpDef::AttrDef& a2) { return true; } -uint64 AttrDefHash(const OpDef::AttrDef& a) { - uint64 h = Hash64(a.name()); +uint64_t AttrDefHash(const OpDef::AttrDef& a) { + uint64_t h = Hash64(a.name()); h = Hash64(a.type().data(), a.type().size(), h); h = Hash64Combine(AttrValueHash(a.default_value()), h); h = Hash64(a.description().data(), a.description().size(), h); - h = Hash64Combine(static_cast<uint64>(a.has_minimum()), h); - h = Hash64Combine(static_cast<uint64>(a.minimum()), h); + h = Hash64Combine(static_cast<uint64_t>(a.has_minimum()), h); + h = Hash64Combine(static_cast<uint64_t>(a.minimum()), h); h = Hash64Combine(AttrValueHash(a.allowed_values()), h); return h; } @@ -837,7 +839,7 @@ bool RepeatedAttrDefEqual( return true; } -uint64 RepeatedAttrDefHash( +uint64_t RepeatedAttrDefHash( const protobuf::RepeatedPtrField<OpDef::AttrDef>& a) { // Insert AttrDefs into map to deterministically sort by name std::vector<const OpDef::AttrDef*> a_sorted; @@ -850,7 +852,7 @@ uint64 RepeatedAttrDefHash( return lhs->name() < rhs->name(); }); // Iterate and combines hashes of keys and values - uint64 h = 0xDECAFCAFFE; + uint64_t h = 0xDECAFCAFFE; for (const auto& def : a_sorted) { h = Hash64(def->name().data(), def->name().size(), h); h = Hash64Combine(AttrDefHash(*def), h); @@ -884,8 +886,8 @@ bool OpDefEqual(const OpDef& o1, const OpDef& o2) { return AreSerializedProtosEqual(o1_copy, o2_copy); } -uint64 OpDefHash(const OpDef& o) { - uint64 h = RepeatedAttrDefHash(o.attr()); +uint64_t OpDefHash(const OpDef& o) { + uint64_t h = RepeatedAttrDefHash(o.attr()); // Compute deterministic order-independent control outputs hash. std::vector<string> control_output; diff --git a/tensorflow/core/framework/op_def_util.h b/tensorflow/core/framework/op_def_util.h index be1f08225c0e2e..abaaeefb03c9a8 100644 --- a/tensorflow/core/framework/op_def_util.h +++ b/tensorflow/core/framework/op_def_util.h @@ -88,7 +88,7 @@ void RemoveNonDeprecationDescriptionsFromOpDef(OpDef* op_def); bool AttrDefEqual(const OpDef::AttrDef& a1, const OpDef::AttrDef& a2); // Returns hash of `a` that is consistent with AttrDefEqual. -uint64 AttrDefHash(const OpDef::AttrDef& a); +uint64_t AttrDefHash(const OpDef::AttrDef& a); // Returns true if all AttrDefs in `a1` equal corresponding AttrDefs in // `a2`. Correspondence is established by name.
@@ -96,14 +96,15 @@ bool RepeatedAttrDefEqual(const protobuf::RepeatedPtrField<OpDef::AttrDef>& a1, const protobuf::RepeatedPtrField<OpDef::AttrDef>& a2); // Returns hash of `a` that is consistent with RepeatedAttrDefEqual -uint64 RepeatedAttrDefHash(const protobuf::RepeatedPtrField<OpDef::AttrDef>& a); +uint64_t RepeatedAttrDefHash( + const protobuf::RepeatedPtrField<OpDef::AttrDef>& a); // Returns true if `o1` is equal to `o2`. // Equality includes all the fields. OpDef.attr field is treated as a set. bool OpDefEqual(const OpDef& o1, const OpDef& o2); // Returns hash of `o` that is consistent with AttrDefEqual. -uint64 OpDefHash(const OpDef& o); +uint64_t OpDefHash(const OpDef& o); } // namespace tensorflow diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc index 333a103cef7e65..41fd90d4e79fcf 100644 --- a/tensorflow/core/framework/op_def_util_test.cc +++ b/tensorflow/core/framework/op_def_util_test.cc @@ -27,13 +27,13 @@ limitations under the License. namespace tensorflow { namespace { -OpDef FromText(const string& text) { +OpDef FromText(const std::string& text) { OpDef op_def; EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &op_def)); return op_def; } -OpDef::AttrDef ADef(const string& text) { +OpDef::AttrDef ADef(const std::string& text) { OpDef::AttrDef attr_def; EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &attr_def)); return attr_def; @@ -41,7 +41,7 @@ OpDef::AttrDef ADef(const string& text) { class ValidateOpDefTest : public ::testing::Test { protected: - absl::Status TestProto(const string& text) { + absl::Status TestProto(const std::string& text) { return ValidateOpDef(FromText(text)); } @@ -58,7 +58,7 @@ class ValidateOpDefTest : public ::testing::Test { }; namespace { -void ExpectFailure(const absl::Status& status, const string& message) { +void ExpectFailure(const absl::Status& status, const std::string& message) { EXPECT_FALSE(status.ok()) << "Did not see error with: " << message; if (!status.ok()) { LOG(INFO) << "message: " << status; @@ -516,9 +516,9 @@ void ExpectDifferent(const OpDef& o1, const OpDef& o2) { } TEST(OpDefEqualityTest, EqualAndHash) { - string a1 = "attr { name: 'a' type: 'string' } "; - string a2 = "attr { name: 'b' type: 'string' } "; - string a3 = "attr { name: 'c' type: 'int32' } "; + std::string a1 = "attr { name: 'a' type: 'string' } "; + std::string a2 = "attr { name: 'b' type: 'string' } "; + std::string a3 = "attr { name: 'c' type: 'int32' } "; OpDef o1 = FromText(absl::StrCat("name: 'MatMul' ", a1)); OpDef o2 = FromText(absl::StrCat("name: 'MatMul' ", a2)); OpDef o3 = FromText(absl::StrCat("name: 'MatMul' ", a1, a2)); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 026b8e677ac668..79766a2d187d93 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -30,10 +30,11 @@ limitations under the License. namespace tensorflow { -string WordWrap(absl::string_view prefix, absl::string_view str, int width) { - const string indent_next_line = "\n" + Spaces(prefix.size()); +std::string WordWrap(absl::string_view prefix, absl::string_view str, + int width) { + const std::string indent_next_line = "\n" + Spaces(prefix.size()); width -= prefix.size(); - string result; + std::string result; absl::StrAppend(&result, prefix); while (!str.empty()) { @@ -100,8 +101,8 @@ static bool SplitAt(char split_ch, absl::string_view* orig, // Does this line start with "<field>:" where "<field>" is // in multi_line_fields? Sets *colon_pos to the position of the colon.
-static bool StartsWithFieldName(absl::string_view line, - const std::vector<string>& multi_line_fields) { +static bool StartsWithFieldName( + absl::string_view line, const std::vector<std::string>& multi_line_fields) { absl::string_view up_to_colon; if (!SplitAt(':', &line, &up_to_colon)) return false; while (absl::ConsumePrefix(&up_to_colon, " ")) @@ -115,8 +116,8 @@ static bool StartsWithFieldName(absl::string_view line, } static bool ConvertLine(absl::string_view line, - const std::vector<string>& multi_line_fields, - string* ml) { + const std::vector<std::string>& multi_line_fields, + std::string* ml) { // Is this a field we should convert? if (!StartsWithFieldName(line, multi_line_fields)) { return false; } @@ -140,7 +141,7 @@ static bool ConvertLine(absl::string_view line, absl::string_view suffix = after_colon.substr(last_quote + 1); // We've now parsed line into '<name>: "<escaped>"<suffix>' - string unescaped; + std::string unescaped; if (!absl::CUnescape(escaped, &unescaped, nullptr)) { // Error unescaping, abort the conversion. return false; } @@ -148,8 +149,8 @@ static bool ConvertLine(absl::string_view line, // No more errors possible at this point. // Find a string to mark the end that isn't in unescaped. - string end = "END"; - for (int s = 0; unescaped.find(end) != string::npos; ++s) { + std::string end = "END"; + for (int s = 0; unescaped.find(end) != std::string::npos; ++s) { end = absl::StrCat("END", s); } @@ -163,9 +164,10 @@ static bool ConvertLine(absl::string_view line, return true; } -string PBTxtToMultiline(absl::string_view pbtxt, - const std::vector<string>& multi_line_fields) { - string ml; +std::string PBTxtToMultiline( + absl::string_view pbtxt, + const std::vector<std::string>& multi_line_fields) { + std::string ml; // Probably big enough, since the input and output are about the // same size, but just a guess. ml.reserve(pbtxt.size() * (17.
/ 16)); @@ -184,20 +186,21 @@ string PBTxtToMultiline(absl::string_view pbtxt, // Given a single line of text `line` with first : at `colon`, determine if // there is an "<<END" expression after the colon. -static void StringReplace(const string& from, const string& to, string* s) { +static void StringReplace(const std::string& from, const std::string& to, + std::string* s) { // Split *s into pieces delimited by `from`. - std::vector<string> split; + std::vector<std::string> split; - string::size_type pos = 0; + std::string::size_type pos = 0; while (pos < s->size()) { auto found = s->find(from, pos); - if (found == string::npos) { + if (found == std::string::npos) { split.push_back(s->substr(pos)); break; } else { @@ -271,10 +275,10 @@ static void StringReplace(const string& from, const string& to, string* s) { *s = absl::StrJoin(split, to); } -static void RenameInDocs(const string& from, const string& to, +static void RenameInDocs(const std::string& from, const std::string& to, ApiDef* api_def) { - const string from_quoted = absl::StrCat("`", from, "`"); - const string to_quoted = absl::StrCat("`", to, "`"); + const std::string from_quoted = absl::StrCat("`", from, "`"); + const std::string to_quoted = absl::StrCat("`", to, "`"); for (int i = 0; i < api_def->in_arg_size(); ++i) { if (!api_def->in_arg(i).description().empty()) { StringReplace(from_quoted, to_quoted, @@ -480,17 +484,17 @@ ApiDefMap::ApiDefMap(const OpList& op_list) { ApiDefMap::~ApiDefMap() {} -absl::Status ApiDefMap::LoadFileList(Env* env, - const std::vector<string>& filenames) { +absl::Status ApiDefMap::LoadFileList( + Env* env, const std::vector<std::string>& filenames) { for (const auto& filename : filenames) { TF_RETURN_IF_ERROR(LoadFile(env, filename)); } return absl::OkStatus(); } -absl::Status ApiDefMap::LoadFile(Env* env, const string& filename) { +absl::Status ApiDefMap::LoadFile(Env* env, const std::string& filename) { if (filename.empty()) return absl::OkStatus(); - string contents; + std::string contents; TF_RETURN_IF_ERROR(ReadFileToString(env, filename, &contents)); absl::Status status = LoadApiDef(contents); if (!status.ok()) { @@ -502,8 +506,8 @@ absl::Status ApiDefMap::LoadFile(Env* env, const string& filename) { return absl::OkStatus(); } -absl::Status ApiDefMap::LoadApiDef(const string& api_def_file_contents) { - const string contents = PBTxtFromMultiline(api_def_file_contents); +absl::Status ApiDefMap::LoadApiDef(const std::string& api_def_file_contents) { + const std::string contents = PBTxtFromMultiline(api_def_file_contents); ApiDefs api_defs; TF_RETURN_IF_ERROR( proto_utils::ParseTextFormatFromString(contents, &api_defs)); @@ -522,7 +526,7 @@ void ApiDefMap::UpdateDocs() { for (auto& name_and_api_def : map_) { auto& api_def = name_and_api_def.second; CHECK_GT(api_def.endpoint_size(), 0); - const string canonical_name = api_def.endpoint(0).name(); + const std::string canonical_name = api_def.endpoint(0).name(); if (api_def.graph_op_name() != canonical_name) { RenameInDocs(api_def.graph_op_name(), canonical_name, &api_def); } @@ -544,7 +548,7 @@ void ApiDefMap::UpdateDocs() { } } -const tensorflow::ApiDef* ApiDefMap::GetApiDef(const string& name) const { +const tensorflow::ApiDef* ApiDefMap::GetApiDef(const std::string& name) const { return gtl::FindOrNull(map_, name); } } // namespace tensorflow diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h index 27ffe522a6dd35..e5414c031abdca 100644 --- a/tensorflow/core/framework/op_gen_lib.h +++ b/tensorflow/core/framework/op_gen_lib.h @@ -29,13 +29,14 @@ namespace tensorflow { // Forward declare protos so their symbols can be removed from .so exports class OpDef; -inline string Spaces(int n) { return string(n, ' '); } +inline std::string Spaces(int n) { return std::string(n, ' '); } // Wrap prefix + str to be at
most width characters, indenting every line // after the first by prefix.size() spaces. Intended use case is something // like prefix = " Foo(" and str is a list of arguments (terminated by a ")"). // TODO(josh11b): Option to wrap on ", " instead of " " when possible. -string WordWrap(absl::string_view prefix, absl::string_view str, int width); +std::string WordWrap(absl::string_view prefix, absl::string_view str, + int width); // Looks for an "=" at the beginning of *description. If found, strips it off // (and any following spaces) from *description and return true. Otherwise @@ -43,9 +44,9 @@ string WordWrap(absl::string_view prefix, absl::string_view str, int width); bool ConsumeEquals(absl::string_view* description); // Convert text-serialized protobufs to/from multiline format. -string PBTxtToMultiline(absl::string_view pbtxt, - const std::vector<string>& multi_line_fields); -string PBTxtFromMultiline(absl::string_view multiline_pbtxt); +std::string PBTxtToMultiline(absl::string_view pbtxt, + const std::vector<std::string>& multi_line_fields); +std::string PBTxtFromMultiline(absl::string_view multiline_pbtxt); // Takes a list of files with ApiDefs text protos, and allows you to // look up the specific ApiDef for any given op. @@ -62,20 +63,21 @@ class ApiDefMap { // definitions take precedence. // ApiDefs loaded from files must contain a subset of ops defined // in the OpList passed to the constructor. - absl::Status LoadFileList(Env* env, const std::vector<string>& filenames); + absl::Status LoadFileList(Env* env, + const std::vector<std::string>& filenames); // Load a single file. Api definitions are merged if the same // op definition is loaded multiple times. Later-loaded // definitions take precedence. // ApiDefs loaded from file must contain a subset of ops defined // in the OpList passed to the constructor. - absl::Status LoadFile(Env* env, const string& filename); + absl::Status LoadFile(Env* env, const std::string& filename); // Load ApiDefs from string containing ApiDefs text proto. // api_def_file_contents is expected to be in "multiline format". // ApiDefs must contain a subset of ops defined in OpsList // passed to the constructor. - absl::Status LoadApiDef(const string& api_def_file_contents); + absl::Status LoadApiDef(const std::string& api_def_file_contents); // Updates ApiDef docs. For example, if ApiDef renames an argument // or attribute, applies these renames to descriptions as well. @@ -89,10 +91,10 @@ class ApiDefMap { // Note: Returned ApiDef pointer should stay valid even after calling // Load* functions defined above. Subsequent calls to Load* might modify // returned ApiDef contents, but should never remove the ApiDef itself. - const ApiDef* GetApiDef(const string& name) const; + const ApiDef* GetApiDef(const std::string& name) const; private: - std::unordered_map<string, ApiDef> map_; + std::unordered_map<std::string, ApiDef> map_; }; } // namespace tensorflow diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc index b08c77ca83221c..b06646d9fc51bd 100644 --- a/tensorflow/core/framework/op_gen_lib_test.cc +++ b/tensorflow/core/framework/op_gen_lib_test.cc @@ -72,7 +72,7 @@ END TEST(OpGenLibTest, MultilinePBTxt) { // Non-multiline pbtxt - const string pbtxt = R"(foo: "abc" + const std::string pbtxt = R"(foo: "abc" foo: "" foo: "\n\n" foo: "abc\nEND" bar: "quotes:\"" )"; // Field "foo" converted to multiline but not "bar". - const string ml_foo = R"(foo: <